In [12]:
debug = True

In [2]:
from custum_frozen_lake_env import CustumFrozenLakeEnv
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [13]:
env = CustumFrozenLakeEnv(map_name="8x8")

obs_space = env.observation_space
n_state = obs_space.n
print('Observation space')
print("Total {} states".format(n_state))

act_space = env.action_space
n_act = act_space.n
print('Action space')
print("Total {} actions".format(n_act))

Observation space
Total 64 states
Action space
Total 4 actions


In [35]:
def logsumexp(x, scale = 1):
    x = np.array(x)/scale
    max_x = np.max(x,axis=1,keepdims=True)
    lse_x = max_x[:,0] + np.log(np.exp(x-max_x).sum()) # Numerical Stability
    lse_x = scale*lse_x
    return lse_x

In [43]:
def softmax(x, scale = 1):
    x = np.array(x)/scale
    max_x = np.max(x,axis=1,keepdims=True)
    e_x = np.exp(x - max_x) # Numerical Stability
    p = e_x/np.sum(e_x,axis=1,keepdims=True)
    p = p/np.sum(p,axis=1,keepdims=True)
    return p

###### soft_value_iteration
 > - 그냥 q-learning과 같고 policy를 구할때 q-value를 입력으로하는 softmax를 쓴것만 다르다.
- 그리고 q-function으로 value-function구할때 numerical stability를 위한 trick이 들어갔다.(logsumexp)

In [52]:
def soft_value_iteration(env,rewards=None,gamma=0.99,epsilon=1e-6,scale=1e-5):
    obs_space = env.observation_space
    n_state = obs_space.n
    act_space = env.action_space
    n_act = act_space.n
    
        
    P = np.zeros((n_state,n_act,n_state))
    r = np.zeros((n_state,n_act,n_state))
    
    for s in env.unwrapped.P.keys(): # For all states s, update v(s)
        for a in env.unwrapped.P[s].keys(): # For all actions a
            for prob, next_s, reward, done in env.unwrapped.P[s][a]: # For all possible transitions (s,a,s')
                P[s][a][next_s]=prob
                if rewards is None:
                    r[s][a][next_s]=reward
                else:
                    r[s][a][next_s]=rewards[s][a][next_s]
        
    value = np.random.uniform(size=(n_state,))
    
    if debug:
        print('Obs space:', obs_space, 'N_state:', n_state)
        print('Act_space:', act_space, 'N_act:', n_act)
        print("Transition prob P's shape:", P.shape)
        print("Reward func r's shape:", r.shape)
        print("Value function's shape:", value.shape)    
            
    while True:
        q = np.sum((r + gamma * np.tile(value[np.newaxis,np.newaxis,:],reps=(n_state,n_act,1)))*P,axis=2)
        v_prime = logsumexp(q, scale=scale)
        dist = np.max(np.abs(value-v_prime))
        value = v_prime
        if dist < epsilon:
            break
            
    policy = softmax(q, scale=scale)
    return policy, value

In [53]:
env = CustumFrozenLakeEnv(map_name="8x8")

obs_space = env.observation_space
n_state = obs_space.n
act_space = env.action_space
n_act = act_space.n

soft_value_iteration(env)


Obs space: Discrete(64) N_state: 64
Act_space: Discrete(4) N_act: 4
Transition prob P's shape: (64, 4, 64)
Reward func r's shape: (64, 4, 64)
Value function's shape: (64,)


(array([[0.00000000e+000, 5.00000000e-001, 5.00000000e-001,
         0.00000000e+000],
        [0.00000000e+000, 8.56593352e-193, 1.00000000e+000,
         3.72155633e-109],
        [0.00000000e+000, 1.18885622e-254, 1.00000000e+000,
         3.11647434e-137],
        [0.00000000e+000, 3.97971473e-296, 1.00000000e+000,
         1.85911350e-160],
        [0.00000000e+000, 2.47032823e-323, 1.00000000e+000,
         2.33207439e-189],
        [0.00000000e+000, 0.00000000e+000, 1.00000000e+000,
         9.76535595e-250],
        [0.00000000e+000, 3.99441317e-289, 1.00000000e+000,
         0.00000000e+000],
        [5.00000000e-001, 5.00000000e-001, 0.00000000e+000,
         0.00000000e+000],
        [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         1.00000000e+000],
        [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         1.00000000e+000],
        [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         1.00000000e+000],
        [0.00000000e+000, 0.00000000e+000, 

###### get_visitation
> - visitation
 - state visitation: $\rho_{\pi_\theta}(s) = \sum_t^\infty \gamma^t P_{\pi_\theta}(S_t = s)$
 - action visitation: $\rho_{\pi_\theta}(s, a) = \sum_t^\infty \gamma^t P_{\pi_\theta}(S_t = s, A_t = a)$
- feature expectation
 $$\mu(\pi) = \sum_s \rho_\pi(s) \phi(s)$$

In [54]:
def get_visitation(env, policy, gamma=0.99, epsilon=1e-6):
    obs_space = env.observation_space
    n_state = obs_space.n
    act_space = env.action_space
    n_act = act_space.n
    
    P = np.zeros((n_state,n_act,n_state))
    for s in env.unwrapped.P.keys(): # For all states s, update v(s)
        for a in env.unwrapped.P[s].keys(): # For all actions a
            for prob, next_s, reward, done in env.unwrapped.P[s][a]: # For all possible transitions (s,a,s')
                P[s][a][next_s]=prob
    
    d = np.zeros((n_state,))  # vector with n_state length
    d[0] = 1  # 
    
    mu_s = np.zeros((n_state,))
    mu_s[0] = 1
    while True:
        mu_sa = 

$$\gamma  \sum_a \sum_s P(s' \mid s, a) \mu(s, a)$$