# PacMan-v1 - SageMaker notebook

*By Michael Ludvig*

Import the required modules

In [None]:
import os
import sagemaker
from sagemaker.rl import RLEstimator, RLToolkit, RLFramework
from sagemaker_job.misc import get_execution_role, wait_for_s3_object

**Look up the AWS resources (S3 bucket and IAM role)**

In [None]:
# Figure out the S3 bucket: use the SageMaker session's default bucket as the
# root of the training output path.
sage_session = sagemaker.session.Session()
s3_bucket = sage_session.default_bucket()
s3_output_path = 's3://{}/'.format(s3_bucket)
print("S3 bucket path: {}".format(s3_output_path))

# Figure out the execution role: try the SageMaker SDK lookup first and fall
# back to the project helper from sagemaker_job.misc if that lookup fails.
try:
    role = sagemaker.get_execution_role()
except Exception:
    # Deliberately `Exception` rather than a bare `except:` so that a kernel
    # interrupt (KeyboardInterrupt) or SystemExit is not silently swallowed
    # and turned into a fallback lookup.
    role = get_execution_role()

print("Using IAM role arn: {}".format(role))

**RL Estimator** - this is where the training happens

In [None]:
local_mode = True
estimator_wait = True

PRESETS = [
#    "PacMan_A3C",
#    "PacMan_ACER",
#    "PacMan_ClippedPPO",
#    "PacMan_DDQN_BatchRL",
#    "PacMan_DDQN_BCQ_BatchRL",
#    "PacMan_DFP",
    "PacMan_DQN",
#    "PacMan_Dueling_DDQN",
#    "PacMan_NEC",
#    "PacMan_NStepQ",
#    "PacMan_PAL",
#    "PacMan_PG",
#    "PacMan_QR_DQN",
#    "PacMan_Rainbow",
]

if local_mode:
    !/bin/bash ./sagemaker_job/setup.sh
    spot_kwargs = {}
else:
    instance_type = "ml.c5.xlarge"
    spot_kwargs = {
        'train_use_spot_instances': True,
        'train_max_wait': 1*3600,    # Max time waiting for spot instance
        'train_max_run': 1*3600,     # Max training run time
    }

for preset in PRESETS:
    estimator = RLEstimator(entry_point="train-coach.py",
                        source_dir="sagemaker_job",
                        dependencies=["gym_pacman"],
                        toolkit=RLToolkit.COACH,
                        toolkit_version='0.11.0',
                        framework=RLFramework.MXNET,
                        role=role,
                        train_instance_type=instance_type if not local_mode else 'local',
                        train_instance_count=1,
                        output_path=s3_output_path,
                        base_job_name=preset.replace('_', '-'),
                        hyperparameters = {
                            "RLCOACH_PRESET": preset,
                            "improve_steps": 10000,  # short training only for testing the next steps
                            "save_model": 1,
                            'rl.agent_params.algorithm.discount': 0.618,
                            'rl.agent_params.algorithm.beta_entropy': 0.04,
                            'rl.learning_rate': 0.002   # see sagemaker_job/train-coach.py for mapping
                        },
                        **spot_kwargs,
                    )

    estimator.fit(wait=estimator_wait)
    if not local_mode:
        print("Job name: {}".format(estimator._current_job_name))

**Figure out the report names and locations**

In [None]:
# Derive the S3 keys/URLs of the training artifacts from the job name of the
# most recently created estimator.
job_name = estimator._current_job_name
print("Job name: {}".format(job_name))

s3_url = "s3://{}/{}".format(s3_bucket, job_name)

# The key of output.tar.gz differs between local mode and managed training.
if local_mode:
    output_tar_key = "{}/output.tar.gz".format(job_name)
else:
    output_tar_key = "{}/output/output.tar.gz".format(job_name)

intermediate_folder_key = "{}/output/intermediate/".format(job_name)
output_url = "s3://{}/{}".format(s3_bucket, output_tar_key)
intermediate_url = "s3://{}/{}".format(s3_bucket, intermediate_folder_key)

print("S3 job path: {}".format(s3_url))
print("Output.tar.gz location: {}".format(output_url))
print("Intermediate folder path: {}".format(intermediate_url))

# Local scratch folder for files fetched from S3. os.makedirs replaces the
# former shell `mkdir`: no shell string is built from job_name, re-running the
# cell does not fail when the folder already exists, and a genuine failure
# raises instead of being ignored.
tmp_dir = "/tmp/{}".format(job_name)
os.makedirs(tmp_dir, exist_ok=True)
print("Create local folder {}".format(tmp_dir))

**Plot training progress**

In [None]:
%matplotlib inline
import pandas as pd

csv_file_name = "worker_0.simple_rl_graph.main_level.main_level.agent_0.csv"
key = os.path.join(intermediate_folder_key, csv_file_name)
wait_for_s3_object(s3_bucket, key, tmp_dir, training_job_name=job_name)

csv_file = "{}/{}".format(tmp_dir, csv_file_name)
df = pd.read_csv(csv_file)
df = df.dropna(subset=['Training Reward'])

x_axis = 'Episode #'
y_axis = 'Training Reward'

df['group'] = df['Episode #'].floordiv(100)
avg = df.groupby('group')['Training Reward'].mean()
plt = avg.plot()
plt.set_xlabel('Episode x100')
plt.set_ylabel('Average Reward')

## Create SageMaker Predictor Endpoint

In [None]:
# Deploy the trained model behind a single-instance endpoint; inference is
# handled by the deploy-mxnet-coach.py entry point.
predictor = estimator.deploy(
    entry_point='deploy-mxnet-coach.py',
    instance_type='ml.t2.medium',
    initial_instance_count=1,
)

## Testing setup

Emulate the Gym data structures

In [None]:
# This is from gym_pacman/PacMan_V1.py
from enum import IntEnum
import numpy as np

class BoardStatus(IntEnum):
    """Integer codes stored in the cells of the board layers.

    The setup code in this cell fills the board layer with DOT, fills the
    PacMan layer with EMPTY, and marks the PacMan cell with PACMAN.
    """
    EMPTY = 0   # nothing in the cell
    DOT = 1     # cell holds a dot
    PACMAN = 2  # cell holds PacMan (written to the PacMan layer)

class Action(IntEnum):
    """Integer codes for the four movement directions.

    Not referenced by the other cells shown in this notebook; presumably it
    mirrors the action encoding of the training environment -- confirm against
    the gym_pacman source before relying on the numeric values.
    """
    UP = 0
    DOWN = 1
    LEFT = 2
    RIGHT = 3

def get_cell_value(layer, position):
    """Return the value of the cell of ``layer`` addressed by ``position``.

    ``position[0]`` indexes the first axis of ``layer`` and ``position[1]``
    the second; any further items of ``position`` are ignored.
    """
    row_cells = layer[position[0]]
    return row_cells[position[1]]

def set_cell_value(layer, position, value):
    """Store ``value`` in the cell of ``layer`` addressed by ``position``.

    ``layer`` is modified in place; ``position[0]`` indexes its first axis and
    ``position[1]`` its second. Nothing is returned.
    """
    row_cells = layer[position[0]]
    row_cells[position[1]] = value

def build_observation(board=None, pacman=None, scale=None):
    """Build the observation array from the board and PacMan layers.

    The two 2-D layers are stacked along a new last axis and every cell is
    then repeated ``scale`` times along both board axes, so layers of shape
    (H, W) give an observation of shape (H * scale, W * scale, 2).

    Parameters
    ----------
    board : array-like, optional
        Board layer; defaults to the notebook-level ``layer_0_board``.
    pacman : array-like, optional
        PacMan layer; defaults to the notebook-level ``layer_1_pacman``.
    scale : int, optional
        Repeat factor; defaults to the notebook-level ``repeat_multiplier``.

    Returns
    -------
    numpy.ndarray
        The stacked and scaled observation.
    """
    # Fall back to the notebook globals so a plain build_observation() call
    # behaves as it did before the arguments were added.
    if board is None:
        board = layer_0_board
    if pacman is None:
        pacman = layer_1_pacman
    if scale is None:
        scale = repeat_multiplier

    stacked = np.stack([board, pacman], axis=2)
    # scale up the array to prevent "kernel is bigger than input" error
    return stacked.repeat(axis=0, repeats=scale).repeat(axis=1, repeats=scale)

# Every board cell is repeated this many times along both board axes when the
# observation is built.
repeat_multiplier = 4
board_size = (5, 5)

# PacMan position, fixed at [0, 0]. A random start could be drawn instead with
# np.array([np.random.randint(board_size[0]), np.random.randint(board_size[1])]).
position = np.array([0, 0])

# Layer 0: a dot in every cell except the one PacMan occupies.
layer_0_board = np.full(shape=board_size, fill_value=BoardStatus.DOT, dtype=np.int32)
set_cell_value(layer_0_board, position, BoardStatus.EMPTY)

# Layer 1: empty everywhere except the PacMan cell.
layer_1_pacman = np.full(shape=board_size, fill_value=BoardStatus.EMPTY, dtype=np.int32)
set_cell_value(layer_1_pacman, position, BoardStatus.PACMAN)

In [None]:
# Provide the data in the same format as during training
# From gym_pacman/PacMan_v1.py:_get_observation()
# build_observation() (defined in the testing-setup cell) produces exactly the
# stack-and-repeat layout that used to be spelled out again here, so calling
# it keeps the request and the helper from drifting apart.
observation = build_observation()
# Debugging aids -- uncomment to inspect the layers and the observation:
# print(repr(layer_0_board))
# print(repr(layer_1_pacman))
# print(repr(observation))
predictor.predict(data = observation)

In [None]:
# Clean-up step, left commented out so that "Run All" does not tear down the
# endpoint created above. Uncomment and run once the endpoint is no longer needed.
#predictor.delete_endpoint()