In [47]:
import numpy as np
from copy import deepcopy
from distance import canon
from _types import Reward
from env import Env, RandomEnv
from coverage_dist import get_state_dist, get_action_dist
from reward import random_reward

In [48]:
# Loop-based reference implementation: slow, but easy to check by hand.
def slow_epic(reward: Reward, env: Env):
  """Compute three expectation terms of `reward` with explicit Python loops.

  `reward` is indexed as reward[s, a, s']. Action weights p(A) come from
  `get_action_dist(env)` and state weights p(S) from `get_state_dist(env)`.
  (Presumably these are the correction terms of the EPIC canonicalisation --
  confirm against the `distance` module.)

  Returns:
    A tuple (term1, term2, term3) where
      term1 has shape (1, 1, env.n_s); entry x is
        sum_{A, S'} p(A) p(S') * env.discount * reward[x, A, S']
      term2 has shape (env.n_s, 1, 1); entry s is
        sum_{A, S'} p(A) p(S') * reward[s, A, S']
      term3 is a scalar:
        sum_{S, A, S'} p(S) p(A) p(S') * env.discount * reward[S, A, S']
  """
  state_probs = get_state_dist(env)
  action_probs = get_action_dist(env)
  gamma = env.discount
  n_states = env.n_s

  # term1 and term2 average over the same (A, S') pairs for a fixed first
  # index, so both are accumulated in a single pass over that index.
  term1 = np.zeros((1, 1, n_states))
  term2 = np.zeros((n_states, 1, 1))
  for idx in range(n_states):
    for act, p_act in enumerate(action_probs):
      for nxt, p_nxt in enumerate(state_probs):
        weight = p_act * p_nxt
        r_val = reward[idx, act, nxt]
        term1[0, 0, idx] += weight * (gamma * r_val)
        term2[idx, 0, 0] += weight * r_val

  # Discounted mean reward over fully random (S, A, S') triples.
  term3 = 0
  for cur, p_cur in enumerate(state_probs):
    for act, p_act in enumerate(action_probs):
      p_cur_act = p_cur * p_act
      for nxt, p_nxt in enumerate(state_probs):
        weight = p_cur_act * p_nxt
        term3 += weight * gamma * reward[cur, act, nxt]

  return term1, term2, term3

In [49]:
# Hand-built toy environment with stochastic (non-deterministic) transitions:
# 2 states, 2 actions.
n_s = 2
n_a = 2
discount = 0.9
init_dist = np.array([0.5, 0.5])
# Every innermost row sums to 1.
# NOTE(review): presumably indexed [state, action, next_state] -- confirm
# against the Env implementation.
transition_dist = np.array([
  [
    [0.9, 0.1],
    [0.3, 0.7],
  ],
  [
    [0.2, 0.8],
    [0.9, 0.1],
  ],
])
# slow_epic reads this array as reward[s, a, s'].
reward = np.array([
    [
      [0, 0],
      [-1, 5],
    ],
    [
      [-2, 1],
      [1, 0],
    ],
])

env = Env(n_s, n_a, discount, init_dist, transition_dist)

# Hand-computed expectation for a correct implementation:
# term1 = [0.9, 0], term2 = [1, 0], term3 = 0.45 (singleton dims dropped)
slow_epic(reward, env)

(array([[[0.9, 0. ]]]),
 array([[[1.]],
 
        [[0.]]]),
 0.45)

In [50]:
def epic_brr(reward: Reward, env: Env):
  """Vectorised computation of the three expectation terms of `reward`.

  `reward` is treated as a 3-D array whose axis 0 is weighted by the state
  distribution, axis 1 by the action distribution and axis 2 by the state
  distribution again (all via NumPy broadcasting).

  Returns:
    A tuple (term1, term2, term3) where
      term1 is env.discount times the (axis 1, axis 2)-weighted mean reward,
        laid out along the last axis (shape (1, 1, n));
      term2 is the same weighted mean without the discount, laid out along
        the first axis (shape (n, 1, 1));
      term3 is env.discount times the fully weighted mean reward (a scalar).
  """
  state_probs = get_state_dist(env)
  action_probs = get_action_dist(env)

  # Broadcastable views: one per axis of `reward`.
  w_cur = state_probs[:, np.newaxis, np.newaxis]
  w_act = action_probs[np.newaxis, :, np.newaxis]
  w_nxt = state_probs[np.newaxis, np.newaxis, :]

  # Mean reward for each first-axis index, averaged over the other two axes.
  mean_by_state = (reward * w_act * w_nxt).sum(axis=(1, 2))

  return (
    env.discount * mean_by_state[np.newaxis, np.newaxis, :],
    mean_by_state[:, np.newaxis, np.newaxis],
    env.discount * (reward * w_cur * w_act * w_nxt).sum(),
  )

# Evaluate on the toy env defined earlier; the result should match slow_epic's.
epic_brr(reward, env)

(array([[[0.9, 0. ]]]),
 array([[[1.]],
 
        [[0.]]]),
 0.45)

In [51]:
# Randomised regression check: on freshly sampled environments and rewards,
# the vectorised terms must agree with the loop-based reference.
for _ in range(10):
  # NOTE(review): presumably (n_states, n_actions) = (8, 2) -- confirm
  # against RandomEnv's signature.
  e = RandomEnv(8, 2)
  r = random_reward(e)
  s1, s2, s3 = slow_epic(r, e)
  b1, b2, b3 = epic_brr(r, e)
  # assert_allclose reports which elements differ, rejects mismatched shapes
  # instead of silently broadcasting them, and is not stripped under
  # `python -O`. The tolerances are np.isclose's defaults; the loop-based
  # result is the `desired` reference.
  for label, slow, fast in (("term1", s1, b1), ("term2", s2, b2), ("term3", s3, b3)):
    np.testing.assert_allclose(
      fast, slow, rtol=1e-5, atol=1e-8, equal_nan=False,
      err_msg=f"epic_brr {label} disagrees with slow_epic",
    )

In [52]:
def epic_brrnt(reward, env):
  """Shape `reward` with the loop-based terms from `slow_epic`.

  Returns `reward + term1 - term2 - term3`, where the three terms broadcast
  against `reward`.
  """
  next_term, cur_term, const_term = slow_epic(reward, env)
  shaped = reward + next_term
  shaped = shaped - cur_term
  return shaped - const_term

def epic_brrrr(reward, env):
  """Shape `reward` with the vectorised terms from `epic_brr`.

  Returns `reward + term1 - term2 - term3`, where the three terms broadcast
  against `reward`.
  """
  next_term, cur_term, const_term = epic_brr(reward, env)
  shaped = reward + next_term
  shaped = shaped - cur_term
  return shaped - const_term

# Randomised regression check: the fully shaped reward from the vectorised
# path must agree with the one from the loop-based reference.
for _ in range(10):
  # NOTE(review): presumably (n_states, n_actions) = (8, 2) -- confirm
  # against RandomEnv's signature.
  e = RandomEnv(8, 2)
  r = random_reward(e)
  expected = epic_brrnt(r, e)
  actual = epic_brrrr(r, e)
  # assert_allclose reports which elements differ, rejects mismatched shapes
  # instead of silently broadcasting them, and is not stripped under
  # `python -O`. The tolerances are np.isclose's defaults.
  np.testing.assert_allclose(
    actual, expected, rtol=1e-5, atol=1e-8, equal_nan=False,
    err_msg="epic_brrrr disagrees with epic_brrnt",
  )