# Chapter 3: Building Your First Distributed Application With Ray Core


You can run this notebook directly in
[Colab](https://colab.research.google.com/github/maxpumperla/learning_ray/blob/main/notebooks/ch_03_core_app.ipynb).

The book has been written for Ray 2.2.0, which at the time of writing has not
officially been released yet. If you are reading this and this version is already
available, you can install it using `pip install ray==2.2.0`. If not, you can
use a nightly wheel (here for Python 3.7 on Linux):

In [None]:
! pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl

Should you not run this notebook in Colab and need another type of wheel, please
refer to Ray's [installation instructions for nightlies](https://docs.ray.io/en/latest/ray-overview/installation.html#install-nightlies).

In [None]:
# tag::discrete_actions[]
import random


class Discrete:
    def __init__(self, num_actions: int):
        """Action space holding the integers 0 .. num_actions - 1.

        For instance, Discrete(4) can encode a move in one of the four
        cardinal directions.
        """
        self.n = num_actions

    def sample(self):
        """Return one randomly drawn action from the space."""
        highest_action = self.n - 1
        return random.randint(0, highest_action)  # <1>


# Build a four-action space and print one randomly sampled action.
space = Discrete(4)
print(space.sample())  # <2>
# end::discrete_actions[]

In [None]:
# tag::init_env[]
import os


class Environment:
    """A 5x5 grid world in which a seeker has to find a fixed goal.

    Positions are (row, column) tuples. The seeker starts at (0, 0), the goal
    sits at (4, 4), and actions are the integers 0-3 (down, left, up, right).
    """

    def __init__(self, *args, **kwargs):
        """Place seeker and goal and define the action/observation spaces.

        Any positional or keyword arguments are accepted and ignored.
        """
        self.seeker, self.goal = (0, 0), (4, 4)  # <1>
        self.info = {'seeker': self.seeker, 'goal': self.goal}

        self.action_space = Discrete(4)  # <2>
        self.observation_space = Discrete(5*5)  # <3>
# end::init_env[]

# tag::env_helpers[]
    def reset(self):  # <1>
        """Reset seeker position and return observations."""
        self.seeker = (0, 0)
        # Keep the info dict in sync with the seeker's new position.
        self.info['seeker'] = self.seeker

        return self.get_observation()

    def get_observation(self):
        """Encode the seeker position as integer"""
        return 5 * self.seeker[0] + self.seeker[1]  # <2>

    def get_reward(self):
        """Reward finding the goal"""
        return 1 if self.seeker == self.goal else 0  # <3>

    def is_done(self):
        """We're done if we found the goal"""
        return self.seeker == self.goal  # <4>
# end::env_helpers[]

# tag::env_step[]
    def step(self, action):
        """Take a step in a direction and return all available information.

        Moves that would leave the grid are clamped to its border.

        Returns:
            A tuple (observation, reward, done, info), where info reflects
            the seeker position after the move.

        Raises:
            ValueError: if action is not one of 0, 1, 2 or 3.
        """
        if action == 0:  # move down
            self.seeker = (min(self.seeker[0] + 1, 4), self.seeker[1])
        elif action == 1:  # move left
            self.seeker = (self.seeker[0], max(self.seeker[1] - 1, 0))
        elif action == 2:  # move up
            self.seeker = (max(self.seeker[0] - 1, 0), self.seeker[1])
        elif action == 3:  # move right
            self.seeker = (self.seeker[0], min(self.seeker[1] + 1, 4))
        else:
            raise ValueError("Invalid action")

        # The info dict was filled in __init__; refresh it so callers do not
        # keep seeing the initial seeker position.
        self.info['seeker'] = self.seeker

        obs = self.get_observation()
        rew = self.get_reward()
        done = self.is_done()
        return obs, rew, done, self.info  # <1>
# end::env_step[]

# tag::env_render[]
    def render(self, *args, **kwargs):
        """Render the environment, e.g. by printing its representation."""
        # Clear the terminal with the command matching the operating system.
        os.system('cls' if os.name == 'nt' else 'clear')  # <1>
        # Every row holds five empty cells plus a closing bar and newline.
        grid = [['| ' for _ in range(5)] + ["|\n"] for _ in range(5)]
        grid[self.goal[0]][self.goal[1]] = '|G'
        # The seeker is drawn last, so it overwrites the goal marker when
        # both share a cell.
        grid[self.seeker[0]][self.seeker[1]] = '|S'  # <2>
        print(''.join([''.join(grid_row) for grid_row in grid]))  # <3>
# end::env_render[]

In [None]:
# tag::env_test[]
import time

# Play one episode with randomly sampled actions, redrawing the grid after
# every step until the seeker has reached the goal.
environment = Environment()

while not environment.is_done():
    random_action = environment.action_space.sample()  # <1>
    environment.step(random_action)
    time.sleep(0.1)  # short pause between rendered frames
    environment.render()  # <2>
# end::env_test[]

In [None]:
# tag::policy[]
import numpy as np

class Policy:

    def __init__(self, env):
        """Set up an all-zero value table with one row per state and one
        column per action, sized from the environment's spaces.
        """
        num_states = env.observation_space.n
        num_actions = env.action_space.n
        self.state_action_table = [
            [0] * num_actions for _ in range(num_states)  # <1>
        ]
        self.action_space = env.action_space

    def get_action(self, state, explore=True, epsilon=0.1):  # <2>
        """Pick a random action when exploring, else the best-valued one.

        A random number is only drawn when explore is set; exploration then
        happens if that number falls below epsilon.
        """
        greedy = not (explore and random.uniform(0, 1) < epsilon)  # <3>
        if greedy:
            action_values = self.state_action_table[state]
            return np.argmax(action_values)  # <4>
        return self.action_space.sample()
# end::policy[]

In [None]:
# tag::simulation[]
class Simulation(object):
    def __init__(self, env):
        """Keep a reference to the environment whose episodes are played."""
        self.env = env

    def rollout(self, policy, render=False, explore=True, epsilon=0.1):  # <1>
        """Play one episode with the policy and return its transitions.

        Each entry of the returned list is [state, action, reward,
        next_state]. With render set, the environment is drawn after every
        step, following a short pause.
        """
        trajectory = []
        current = self.env.reset()  # <2>
        while True:
            chosen = policy.get_action(current, explore, epsilon)  # <3>
            successor, reward, finished, _ = self.env.step(chosen)  # <4>
            trajectory.append([current, chosen, reward, successor])  # <5>
            current = successor
            if render:  # <6>
                time.sleep(0.05)
                self.env.render()
            if finished:
                break

        return trajectory
# end::simulation[]

In [None]:
# tag::naive_rollout[]
# Roll out a freshly created policy with epsilon=1.0, i.e. exploring on
# (practically) every step, while rendering the episode. The rollout itself
# never writes to the state-action table, so the rows printed afterwards are
# still the initial zeros.
untrained_policy = Policy(environment)
sim = Simulation(environment)

exp = sim.rollout(untrained_policy, render=True, epsilon=1.0)  # <1>
for row in untrained_policy.state_action_table:
    print(row)  # <2>
# end::naive_rollout[]

In [None]:
# tag::update_policy[]
def update_policy(policy, experiences, weight=0.1, discount_factor=0.9):
    """Fold a batch of (state, action, reward, next_state) experiences into
    the policy's state-action table, modifying the table in place.

    For every experience the stored value is blended with a target made of
    the reward plus the discounted best value of the next state; `weight`
    controls how much of the target is mixed in.
    """
    for state, action, reward, next_state in experiences:  # <1>
        table = policy.state_action_table
        best_next = np.max(table[next_state])  # <2>
        current = table[state][action]  # <3>
        target = reward + discount_factor * best_next
        blended = (1 - weight) * current + weight * target  # <4>
        table[state][action] = blended  # <5>
# end::update_policy[]

In [None]:
# tag::train_policy[]
def train_policy(env, num_episodes=10000, weight=0.1, discount_factor=0.9):
    """Learn a policy from scratch by alternating rollouts and updates.

    A new Policy and Simulation are created for `env`; each of the
    `num_episodes` iterations plays one episode and feeds its experiences
    to `update_policy`. The resulting policy is returned.
    """
    learner = Policy(env)
    simulator = Simulation(env)
    for _episode in range(num_episodes):
        batch = simulator.rollout(learner)  # <1>
        update_policy(learner, batch, weight, discount_factor)  # <2>

    return learner


# Train on the module-level environment using the default hyperparameters.
trained_policy = train_policy(environment)  # <3>
# end::train_policy[]

In [None]:
# tag::evaluate_policy[]
def evaluate_policy(env, policy, num_episodes=10):
    """Measure how many steps the policy needs per episode on average.

    Runs `num_episodes` rendered rollouts without exploration, prints the
    average episode length and returns it.
    """
    runner = Simulation(env)
    total_steps = sum(
        len(runner.rollout(policy, render=True, explore=False))  # <1>
        for _ in range(num_episodes)
    )  # <2>
    average = total_steps / num_episodes

    print(f"{average} steps on average "
          f"for a total of {num_episodes} episodes.")

    return average


# Evaluate the sequentially trained policy; this renders every rollout.
evaluate_policy(environment, trained_policy)
# end::evaluate_policy[]

In [None]:
# tag::ray_policy_simulation[]
import ray

# Initialize Ray before any of the remote tasks or actors below are invoked.
ray.init()


@ray.remote
def create_policy():
    """Ray task that builds a fresh Policy for a newly created Environment."""
    return Policy(Environment())  # <1>


@ray.remote
class SimulationActor(Simulation):  # <2>
    """Simulation exposed as a Ray actor; each actor owns its Environment."""
    def __init__(self):
        super().__init__(Environment())
# end::ray_policy_simulation[]

In [None]:
# tag::ray_training[]
@ray.remote
def update_policy_task(policy, experiences_list):
    """Remote Ray task that applies several experience batches to a policy.

    Args:
        policy: The policy to update. The caller in this file passes an
            object reference; presumably Ray resolves it to the actual
            policy before the task body runs - confirm against the Ray docs.
        experiences_list: Iterable of references to rollout experiences;
            each one is fetched with `ray.get` right before it is applied.

    Returns:
        The updated policy.
    """
    # A plain loop instead of a comprehension: the updates are pure side
    # effects, so there is no result list worth building.
    for xp in experiences_list:
        update_policy(policy, ray.get(xp))  # <1>
    return policy


def train_policy_parallel(num_episodes=1000, num_simulations=4):
    """Train a policy with rollouts gathered by several simulation actors.

    Per episode, every actor is asked for one rollout of the current policy
    reference, and a remote update task then produces the next policy
    reference. The final reference is resolved and returned.
    """
    policy_ref = create_policy.remote()  # <2>
    actors = [SimulationActor.remote() for _ in range(num_simulations)]  # <3>

    for _episode in range(num_episodes):
        rollout_refs = [actor.rollout.remote(policy_ref) for actor in actors]  # <4>
        policy_ref = update_policy_task.remote(policy_ref, rollout_refs)  # <5>

    return ray.get(policy_ref)  # <6>
# end::ray_training[]

In [None]:
# tag::ray_evaluation[]
# Train with the parallel Ray implementation (default settings), then
# evaluate the result on the local, module-level environment.
parallel_policy = train_policy_parallel()
evaluate_policy(environment, parallel_policy)
# end::ray_evaluation[]

# ![Task dependency](https://raw.githubusercontent.com/maxpumperla/learning_ray/main/notebooks/images/chapter_03/train_policy.png)