In [1]:
# TODO: restructure everything; adding files through file upload for now.
import numpy as np
import torch
from env import Env, RandomEnv
from reward import random_reward
from _types import Reward
from utils import timed
from canon import epic_canon

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
"""Illustrative rewards for gridworlds."""

import numpy as np

# 3x3 state-reward grids; entry [row, col] is the reward attached to that cell.

# Single +1 reward in the bottom-right corner.
SPARSE_GOAL = np.array(
    [
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 1],
    ]
)

# Single +1 reward in the middle cell.
CENTER_GOAL = np.array(
    [
        [0, 0, 0],
        [0, 1, 0],
        [0, 0, 0],
    ]
)

# Mild -1 penalties off the path, +4 in the bottom-right corner.
OBSTACLE_COURSE = np.array(
    [
        [0, -1, -1],
        [0, 0, 0],
        [-1, -1, 4],
    ]
)

# Same layout as OBSTACLE_COURSE but with a -4 "cliff" along the bottom row.
CLIFF_WALK = np.array(
    [
        [0, -1, -1],
        [0, 0, 0],
        [-4, -4, 4],
    ]
)

# Manhattan distance of every cell from the bottom-right corner.
MANHATTAN_FROM_GOAL = np.array(
    [
        [4, 3, 2],
        [3, 2, 1],
        [2, 1, 0],
    ]
)

# All-zero (float) grid; the same array object is shared by every entry that
# uses it as its potential.
ZERO = np.zeros((3, 3))

# (name, state_reward, potential) triples, in the order they appear in REWARDS.
_REWARD_SPECS = [
    # --- Equivalent rewards ---
    ("sparse_goal", SPARSE_GOAL, ZERO),
    ("sparse_goal_shift", SPARSE_GOAL + 1, ZERO),
    ("sparse_goal_scale", SPARSE_GOAL * 10, ZERO),
    ("dense_goal", SPARSE_GOAL, -MANHATTAN_FROM_GOAL),
    ("antidense_goal", SPARSE_GOAL, MANHATTAN_FROM_GOAL),
    # --- Non-equivalent rewards ---
    # Sparse goal after a shift, a rescale and a reshaping potential.
    ("transformed_goal", SPARSE_GOAL * 4 - 1, -MANHATTAN_FROM_GOAL * 3),
    # Goal sits in the center cell instead of the corner.
    ("center_goal", CENTER_GOAL, ZERO),
    # A few minor penalties to avoid on the way to the goal. The optimal policy
    # here is also optimal for SPARSE_GOAL, but the rewards are not equivalent;
    # they may come apart under some dynamics, though not in an intuitive way.
    ("dirt_path", OBSTACLE_COURSE, ZERO),
    # Avoid the cliff to reach the goal. Shares its set of optimal policies with
    # `dirt_path` (OBSTACLE_COURSE) under deterministic dynamics, but is not
    # equivalent: in sufficiently slippery gridworlds the optimal policy stays on
    # the top line to avoid the chance of falling off the cliff.
    ("cliff_walk", CLIFF_WALK, ZERO),
    # Negation of `sparse_goal`.
    ("sparse_penalty", -SPARSE_GOAL, ZERO),
    # Reward that is zero everywhere.
    ("evaluating_rewards/Zero-v0", ZERO, ZERO),
]

# Lookup table: reward name -> {"state_reward": grid, "potential": grid}.
REWARDS = {
    name: {"state_reward": state_reward, "potential": potential}
    for name, state_reward, potential in _REWARD_SPECS
}

In [None]:
def epic_canon(reward: Reward, env: Env) -> Reward:
  """Return the EPIC-canonicalized version of `reward`.

  With Phi(s) = sum_{a, s'} D_a[a] * D_s[s'] * reward[s, a, s'], the result is

      reward[s, a, s'] + discount * Phi(s') - Phi(s) - discount * E[reward]

  where E[reward] = sum_{s, a, s'} D_s[s] * D_a[a] * D_s[s'] * reward[s, a, s']
  and discount is `env.discount`.

  Args:
    reward: 3-D array or torch tensor indexed as [state, action, next_state].
    env: environment; `env.discount` is read here and `env` is handed to
      `get_state_dist` / `get_action_dist`.

  Returns:
    An array/tensor with the same shape as `reward`. For a floating-point
    torch tensor, the result keeps the dtype and device of `reward`.

  NOTE(review): `get_state_dist` and `get_action_dist` are not defined or
  imported anywhere in the visible notebook; they must be in scope before
  this cell is run. This definition also shadows the `epic_canon` imported
  from `canon` at the top of the notebook.
  """
  # Presumably 1-D weight vectors of length n_s and n_a -- TODO confirm.
  D_s = get_state_dist(env)
  D_a = get_action_dist(env)
  if isinstance(reward, torch.Tensor):
    # isinstance (not `type(...) is`) so Tensor subclasses are handled too.
    # Put the weights on the reward's device, and reuse its dtype when it is a
    # floating type so a float32 reward is not silently promoted to float64.
    # Integer rewards keep the weights' own dtype: casting probabilities to an
    # integer dtype would truncate them to zero.
    dtype = reward.dtype if reward.is_floating_point() else None
    D_s = torch.as_tensor(D_s, dtype=dtype, device=reward.device)
    D_a = torch.as_tensor(D_a, dtype=dtype, device=reward.device)

  # Broadcastable views of the weights along each of the three axes.
  S = D_s[:, None, None]
  A = D_a[None, :, None]
  S_prime = D_s[None, None, :]

  # Phi(s): reward averaged over the action and next-state axes.
  potential = (reward * A * S_prime).sum(axis=(1, 2))

  term1 = env.discount * potential[None, None, :]           # discount * Phi(s')
  term2 = potential[:, None, None]                          # Phi(s)
  term3 = env.discount * (reward * S * A * S_prime).sum()   # discount * E[reward]

  return reward + term1 - term2 - term3

In [13]:
def epic(r1: Reward, r2: Reward, env: Env) -> float:
  """L2 distance between the unit-normalized canonical forms of two rewards.

  Each reward is passed through `epic_canon(reward, env)`, divided by the
  Euclidean norm of its flattened values, and the Euclidean norm of the
  difference of the two results is returned.

  There is no guard for a zero norm: a reward whose canonical form is all
  zeros is divided by zero here.
  """
  unit_rewards = []
  for raw in (r1, r2):
    canonical = epic_canon(raw, env)
    scale = np.linalg.norm(canonical.flatten(), 2)
    unit_rewards.append(canonical / scale)

  first, second = unit_rewards
  gap = first - second
  return np.linalg.norm(gap.flatten(), 2)

In [35]:
# Pick the two 3x3 state-reward grids that will be compared.
cliff_var = REWARDS["cliff_walk"]["state_reward"]
dense_var = REWARDS["dense_goal"]["state_reward"]

# Flatten each grid to a length-9 vector (one entry per cell).
cliff_var_f = cliff_var.flatten()
dense_var_f = dense_var.flatten()

# Broadcast each per-cell reward along the first axis of a (9, 5, 9) float
# tensor, so the value depends only on the first index.
# Presumably the axes are (state, action, next_state) -- TODO confirm.
zero_s = np.zeros((9, 5, 9)) + cliff_var_f[:, None, None]
zero_d = np.zeros((9, 5, 9)) + dense_var_f[:, None, None]

print("Cliff Walk Reward Matrix:", cliff_var, sep="\n")

print("Dense Walk Reward Matrix:", dense_var, sep="\n")

Cliff Walk Reward Matrix:
[[ 0 -1 -1]
 [ 0  0  0]
 [-4 -4  4]]
Dense Walk Reward Matrix:
[[0 0 0]
 [0 0 0]
 [0 0 1]]


In [21]:
import numpy as np
from utils import softmax

class Env:
    """Plain container describing a finite environment.

    Attributes:
        n_s: number of states.
        states: `np.arange(n_s)`.
        n_a: number of actions.
        actions: `np.arange(n_a)`.
        discount: discount factor, stored as given.
        init_dist: initial distribution, stored as given.
        transition_dist: transition distribution, stored as given.
    """

    def __init__(
        self,
        n_s: int,
        n_a: int,
        discount: float,
        init_dist: np.ndarray,
        transition_dist: np.ndarray,
    ):
        # Sizes of the state and action spaces.
        self.n_s, self.n_a = n_s, n_a

        # Integer ids 0..n-1 for every state and action.
        self.states = np.arange(n_s)
        self.actions = np.arange(n_a)

        # Remaining fields are kept exactly as passed in (no copy, no checks).
        self.discount = discount
        self.init_dist = init_dist
        self.transition_dist = transition_dist
class RandomEnv(Env):
    """`Env` with a uniform initial distribution and random transitions.

    Transition scores are drawn from a standard normal. Every score that does
    not exceed a size-dependent threshold is replaced by -20, and the result is
    passed through `utils.softmax`.

    Args:
        n_s: number of states.
        n_a: number of actions.
        discount: discount factor forwarded to `Env`.
        seed: optional seed. When `None` (default) the scores come from NumPy's
            global random state, exactly as before. Otherwise a private
            `np.random.RandomState(seed)` is used, so the environment is
            reproducible without touching global state.
    """

    def __init__(self, n_s: int = 128, n_a: int = 16, discount: float = 0.9,
                 seed: "int | None" = None):
        init_dist = np.ones(n_s) / n_s

        # Cutoff grows with the number of states: 1 below 50 states,
        # 1.5 below 100, and 1.8 otherwise.
        thresh = 1 if n_s < 50 else (1.5 if n_s < 100 else 1.8)

        randn = np.random.randn if seed is None else np.random.RandomState(seed).randn
        scores = randn(n_s, n_a, n_s)

        # Keep scores above the cutoff and push everything else down to -20.
        scores = np.where(scores > thresh, scores, -20.0)

        # NOTE(review): assumes `softmax` normalizes over the last axis so each
        # [s, a, :] slice is a distribution -- confirm against utils.softmax.
        transition_dist = softmax(scores)
        super().__init__(n_s, n_a, discount, init_dist, transition_dist)


In [27]:
# Look up the state-reward grids for the two rewards under comparison.
cliff_var, dense_var = (
    REWARDS[reward_name]["state_reward"]
    for reward_name in ("cliff_walk", "dense_goal")
)

# Random environment with 9 states and 5 actions, matching the (9, 5, 9)
# reward tensors built from the 3x3 grids.
cliff_env = RandomEnv(n_s=9, n_a=5, discount=0.99)


In [37]:
# epic() returns the L2 distance between two unit-norm reward tensors, which is
# at most 2; halving it puts the displayed value in [0, 1].
epic(zero_s, zero_d, cliff_env)/2

0.3675592134610681