In [11]:
## ------------------------------------------------------------------------------

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import pickle

import abc
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1" 
import sys

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
import numpy as np
from numpy.random import exponential
import random
import pandas as pd

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts
from tf_agents.environments.wrappers import ActionRepeat
from tf_agents.environments import batched_py_environment
from tf_agents.environments import parallel_py_environment
from tf_agents.networks.actor_distribution_network import ActorDistributionNetwork
from tensorflow.keras.layers import Reshape
from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.metrics import tf_metrics
from tf_agents.eval.metric_utils import log_metrics
from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver
from tf_agents.policies.random_tf_policy import RandomTFPolicy
from tf_agents.trajectories.trajectory import to_transition
from tf_agents.utils.common import function
from tf_agents.policies.policy_saver import PolicySaver
from tf_agents.trajectories import trajectory
from tf_agents.agents.ddpg import critic_network
from tf_agents.networks import network
from tf_agents.networks import encoding_network
import tf_agents.environments.gym_wrapper
from tf_agents.environments.gym_wrapper import GymWrapper
from gym.wrappers import frame_stack
from tf_agents.agents.sac import sac_agent

import logging

from collections import deque

import sys
sys.path.insert(0, '../Snake_Emulator')
from Snake_v7 import Snake
from Snake_v7 import SnakeEnv

In [2]:
class ShowProgress:
    def __init__(self, total):
        self.counter = 0
        self.total = total
    def __call__(self, trajectory):
        if not trajectory.is_boundary():
            self.counter += 1
        if self.counter % 100 == 0:
            print("\r{}/{}".format(self.counter, self.total), end="")
            
trial = {
"board_size": 8,
"food_reward": 1,
"death_reward": -1,
"step_reward": 0,
"reward_type": 3,
"food_reward_type": 1,
"counter_mode": 1,
"max_counter": 3,
"health_cutoff": 20,
"DRF_MC_high": 1,
"DRF_MC_low": 1,
"immortal": False,
"wall_penalty": -0.25,
"kill_step_reward": 0,
"food_spawn_mode": 1,
"conv_layer_params": None,
"fc_layer_params": [[100, 100, 100]],
"dropout_layer_params": None,
"optimizer_learning_rate": 2.5e-4,
"optimizer_decay": 0.95,
"optimizer_momentum": 0,
"optimizer_epsilon": 0.00001,
"epsilon_decay_steps": 500000,
"epsilon_final": 0.01,
"target_update_period": 20000,
"discount_factor": 0.99,
"init_replay_buffer": 200,
"dataset_sample_batch_size": 64,
"dataset_num_steps": 2,
"dataset_num_parallel_calls":3,
"n_iterations": 10000000
}
locals().update(trial)

critic_joint_fc_layer_params = (256, 256)

In [13]:
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
            
tf.random.set_seed(888)
env = SnakeEnv(BOARD_SIZE=int(board_size),
               MAX_HEALTH=100,
               FOOD_REWARD=food_reward,
               STEP_REWARD=step_reward,
               DEATH_REWARD=death_reward,
               KILL_STEP_REWARD = kill_step_reward,
               REWARD_TYPE = reward_type,
               FOOD_REWARD_TYPE = food_reward_type, 
               FOOD_SPAWN_MODE=food_spawn_mode,
               MAX_COUNTER=int(max_counter),
               HEALTH_CUTOFF=health_cutoff,
               DRF_MC_HIGH=DRF_MC_high,
               DRF_MC_LOW=DRF_MC_low,
               IMMORTAL=immortal,
               WALL_PENALTY=wall_penalty,
              SNAKE_PROTECT=False)

env = GymWrapper(frame_stack.FrameStack(env, 4))
tf_env = tf_py_environment.TFPyEnvironment(env)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

# Preprocessing layers
board_preprocessing = Sequential([
    keras.layers.Lambda(lambda obs: tf.cast(obs, np.float32)),
    keras.layers.Flatten()
])

crit_net = critic_network.CriticNetwork(
    (tf_env.observation_spec(), tf_env.action_spec()),
    observation_fc_layer_params=[100,100,100],
    action_fc_layer_params=[100,100,100],
    joint_fc_layer_params=critic_joint_fc_layer_params)

# Layers params are specified by local variables ovtained from DataFrame
act_net = ActorDistributionNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=board_preprocessing,
    conv_layer_params=conv_layer_params,
    fc_layer_params=[100,100,100],
    batch_squash=False)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

# Create variable that counts the number of training steps
train_step = tf.Variable(0)
# Create optimizer 
optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=optimizer_learning_rate,
                                                decay=optimizer_decay, momentum=optimizer_momentum,
                                                epsilon=optimizer_epsilon, centered=True)

agent = sac_agent.SacAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    actor_network=act_net,
    critic_network=crit_net,
    actor_optimizer=optimizer,
    critic_optimizer=optimizer,
    alpha_optimizer=optimizer,
    target_update_tau=0.005,
    target_update_period=target_update_period,
    td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
    gamma=discount_factor,
    train_step_counter=train_step)

agent.initialize()
# Speed up as tensorflow function
agent.train = function(agent.train)

NotImplementedError: SacAgent does not currently support discrete actions. Action spec: BoundedTensorSpec(shape=(), dtype=tf.int64, name='action', minimum=array(0), maximum=array(3))
  In call to configurable 'SacAgent' (<function SacAgent.__init__ at 0x7fe2a7fa8620>)

In [16]:
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=10000)

# Create the observer that adds trajectories to the replay buffer
replay_buffer_observer = replay_buffer.add_batch
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

train_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric(),
]
logging.getLogger().setLevel(logging.INFO)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

collect_driver = DynamicEpisodeDriver(
    tf_env, # Env to play with
    agent.collect_policy, # Collect policy of the agent
    observers=[replay_buffer_observer] + train_metrics, # pass to all observers
    num_episodes=10) 

# Speed up as tensorflow function
collect_driver.run = function(collect_driver.run)

## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------
## ------------------------------------------------------------------------------

if __name__ == "__main__":

    # a) For storing data
    training_info = [[], [], [], []]
    def add_metrics(arr, train_metrics):
        arr[0].append(train_metrics[0].result().numpy())
        arr[1].append(train_metrics[1].result().numpy())
        arr[2].append(train_metrics[2].result().numpy())
        arr[3].append(train_metrics[3].result().numpy())
    
    # b) For training agent
    def train_agent(n_iterations):
        time_step = None
        # Get initial policy state
        policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
        # Create iterator over dataset and loop
        
        for iteration in range(100):
            # Pass current time step and policy state to get next time step and policy state
            collect_driver.run()
            experience = replay_buffer.gather_all()
            print(experience)
            train_loss = agent.train(experience)
            replay_buffer.clear()
            if iteration % 10 == 0:
                # This adds training data
                print('yeet')
                add_metrics(training_info, train_metrics)
    train_agent(n_iterations=n_iterations)
    
    # c) For storing frames
    def get_vid_frames(policy, filename, num_episodes=100, fps=2):
        frames = []
        for _ in range(num_episodes):
            time_step = tf_env.reset()
            frames.append(np.abs(env.get_board()))
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = tf_env.step(action_step.action)
                frames.append(np.abs(env.get_board()))
        return frames

    # Store Data
    df = pd.DataFrame(np.array(training_info).T, columns=['N_Ep', 'Env_Steps', 'Avf_RM', 'Avg_EPLM'])

    
    # Store Frames
    frames = get_vid_frames(agent.policy, "trained-agent")

        
    # Store Model
    my_policy = agent.policy

Trajectory(step_type=<tf.Tensor: shape=(1, 1538), dtype=int32, numpy=array([[0, 1, 1, ..., 1, 1, 2]], dtype=int32)>, observation=(<tf.Tensor: shape=(1, 1538, 8, 8, 2), dtype=int32, numpy=
array([[[[[0, 0],
          [0, 0],
          [0, 0],
          ...,
          [0, 0],
          [0, 0],
          [0, 0]],

         [[0, 0],
          [0, 0],
          [0, 0],
          ...,
          [0, 0],
          [0, 0],
          [0, 0]],

         [[0, 0],
          [0, 0],
          [0, 0],
          ...,
          [0, 0],
          [0, 0],
          [0, 0]],

         ...,

         [[0, 0],
          [0, 0],
          [0, 0],
          ...,
          [0, 0],
          [0, 0],
          [0, 0]],

         [[0, 0],
          [0, 0],
          [0, 0],
          ...,
          [0, 0],
          [0, 0],
          [0, 0]],

         [[0, 0],
          [0, 0],
          [0, 0],
          ...,
          [0, 1],
          [0, 0],
          [0, 0]]],


        [[[0, 0],
          [0, 0],
         

ValueError: Input 0 of layer EncodingNetwork/dense is incompatible with the layer: expected axis -1 of input shape to have value 131 but received input with shape [1, 201478]