In [13]:
import numpy as np
import torch
from env import Env, RandomEnv
from reward import random_reward
from _types import Reward
from utils import timed
from canon import epic_canon

In [35]:
import numpy as np
from utils import softmax

class Env:
    """Plain container describing a finite, tabular environment.

    Stores the sizes of the state and action spaces, the discount factor and
    the two distribution arrays handed in by the caller. Nothing is validated
    or copied.

    NOTE(review): the first cell also runs `from env import Env, RandomEnv`;
    once this cell executes, this notebook-local class shadows that import.
    """

    def __init__(
        self,
        n_s: int,
        n_a: int,
        discount: float,
        init_dist: np.ndarray,
        transition_dist: np.ndarray,
    ):
        """Record the environment description on the instance.

        Args:
            n_s: Number of states; `states` is set to `np.arange(n_s)`.
            n_a: Number of actions; `actions` is set to `np.arange(n_a)`.
            discount: Discount factor, stored unchanged.
            init_dist: Initial-state distribution, stored by reference
                (presumably of shape `(n_s,)` -- not checked here).
            transition_dist: Transition distribution, stored by reference
                (presumably of shape `(n_s, n_a, n_s)` -- not checked here).
        """
        # Each size is stored next to the index range derived from it.
        self.n_s, self.states = n_s, np.arange(n_s)
        self.n_a, self.actions = n_a, np.arange(n_a)

        # Discounting and dynamics are kept exactly as supplied.
        self.discount = discount
        self.init_dist = init_dist
        self.transition_dist = transition_dist

class RandomEnv(Env):
    """Environment with a uniform start distribution and random sparse dynamics.

    Transition logits are drawn from a standard normal using the global
    `np.random` state. Logits that do not exceed a size-dependent threshold
    are replaced by -20 before `softmax` is applied.

    NOTE(review): `softmax` is imported from `utils`; it is assumed to
    normalise over the last (next-state) axis -- confirm against `utils`.
    """

    def __init__(self, n_s: int = 128, n_a: int = 16, discount: float = 0.9):
        """Sample the dynamics and initialise the base `Env`.

        Args:
            n_s: Number of states.
            n_a: Number of actions.
            discount: Discount factor passed through to `Env`.
        """
        init_dist = np.ones(n_s) / n_s
        # Larger state spaces use a higher cut-off, so fewer logits per
        # (state, action) row survive the masking below.
        thresh = 1 if n_s < 50 else (1.5 if n_s < 100 else 1.8)
        logits = np.random.randn(n_s, n_a, n_s)
        # Everything at or below the cut-off is pushed down to -20.
        logits = np.where(logits > thresh, logits, -20.0)
        transition_dist = softmax(logits)
        super().__init__(n_s, n_a, discount, init_dist, transition_dist)

    def modified_reward_matrix(self, original_reward_matrix: np.ndarray) -> np.ndarray:
        """Replace every reward entry by its expectation over the last axis.

        For each leading index pair `(s, a)` the scalar
        `sum_k transition_dist[s, a, k] * original_reward_matrix[s, a, k]`
        is computed and written into every slot `[s, a, :]` of the result.

        Args:
            original_reward_matrix: Array with the same shape as
                `self.transition_dist`.

        Returns:
            A new float array of that shape; the input is not modified.

        Raises:
            ValueError: If the reward's shape differs from the shape of
                `self.transition_dist`.
        """
        reward = np.asarray(original_reward_matrix)
        transition = np.asarray(self.transition_dist)
        if reward.shape != transition.shape:
            raise ValueError(
                "reward matrix shape {} does not match transition_dist shape {}".format(
                    reward.shape, transition.shape
                )
            )

        # One vectorised reduction instead of a Python loop over (s, a).
        expected = np.sum(transition * reward, axis=-1, keepdims=True)

        modified = np.empty(reward.shape, dtype=float)
        modified[...] = expected  # broadcast the scalar along the last axis
        return modified

def random_reward(env: Env) -> np.ndarray:
    r = np.random.randn(env.n_s, env.n_a, env.n_s)
    if np.random.random() > 0.8:
        thresh = 3 if env.n_s < 50 else (3.5 if env.n_s < 100 else 3.8)
        r = np.where(r > thresh, r, np.zeros_like(r))
    if np.random.random() > 0.3:
        r *= 10 * np.random.random()
    if np.random.random() > 0.7:
        r += 10 * np.random.random()
    if np.random.random() > 0.5:
        potential = np.random.randn(env.n_s)
        potential *= 10 * np.random.random()
        potential += np.random.random()
        r += env.discount * potential[None, None, :] - potential[:, None, None]
    return r


# One random environment and two rewards sampled on it (r1 first, then r2).
env = RandomEnv(n_s=128, n_a=16, discount=0.9)
r1, r2 = random_reward(env), random_reward(env)

# Versions of each reward averaged over the environment's transitions.
r1_tau, r2_tau = map(env.modified_reward_matrix, (r1, r2))


In [36]:
def epic(r1: Reward, r2: Reward, env: Env) -> float:
    """Distance between two rewards after canonicalisation and L2 normalisation.

    Both rewards are passed through `epic_canon(reward, env)`, scaled to unit
    L2 norm (taken over all entries), and the L2 norm of their difference is
    returned. For two non-trivial rewards the result lies in [0, 2].

    A reward whose canonical form is all zeros has norm 0. It is kept as the
    zero vector instead of being divided by zero, which would turn the result
    into NaN.

    Args:
        r1: First reward.
        r2: Second reward.
        env: Environment forwarded to `epic_canon`.

    Returns:
        The L2 distance between the two normalised canonical rewards.
    """

    def _to_unit_norm(canon):
        # L2 norm over every entry, regardless of the array's shape.
        norm = np.linalg.norm(canon.flatten(), 2)
        return canon / norm if norm > 0 else canon

    r1_norm = _to_unit_norm(epic_canon(r1, env))
    r2_norm = _to_unit_norm(epic_canon(r2, env))

    return np.linalg.norm((r1_norm - r2_norm).flatten(), 2)

In [37]:
# d1: distance between the two sampled rewards as they are.
# d2: distance between their counterparts produced by env.modified_reward_matrix.
d1 = epic(r1, r2, env)
d2 = epic(r1_tau, r2_tau, env)

In [40]:
# Display epic(r1, r2, env), the distance for the original reward pair.
d1

1.4147310858370963

In [39]:
# Display epic(r1_tau, r2_tau, env), the distance for the modified reward pair.
d2

1.4618045597388316

In [43]:
import pandas as pd

# Repeat the comparison on 10 freshly sampled environments and reward pairs.
# Columns: d1 is epic(r1, r2, env) and d2 is epic(r1_tau, r2_tau, env).
# NOTE: the loop rebinds env, r1, r2, r1_tau, r2_tau, d1 and d2, so after this
# cell those names hold the values from the last iteration.
rows = []
for i in range(10):
    env = RandomEnv(n_s=128, n_a=16, discount=0.9)
    r1 = random_reward(env)
    r1_tau = env.modified_reward_matrix(r1)

    r2 = random_reward(env)
    r2_tau = env.modified_reward_matrix(r2)

    d1 = epic(r1, r2, env)
    d2 = epic(r1_tau, r2_tau, env)

    rows.append({"d1": d1, "d2": d2})

# Build the frame once from the collected records instead of enlarging an
# empty frame row by row.
results = pd.DataFrame(rows, columns=["d1", "d2"])

print(results)

         d1        d2
0  1.414067  1.415951
1  1.417468  1.421807
2  1.415292  1.459411
3  1.413255  1.404478
4  1.415144  1.427270
5  1.413020  1.370182
6  1.412565  1.374363
7  1.416404  1.401488
8  1.414125  1.429347
9  1.414752  1.387329
