In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#!pip install gym

In [3]:
from frozenlake_helper import get_frozenlake_env, policy_1, policy_2, policy_3, policy_4, LEFT, RIGHT, UP, DOWN
import numpy as np

In [4]:
# Policies evaluated in the exercises below; they are reported as
# "policy 1" .. "policy 4" in this order.
policies = [policy_1, policy_2, policy_3, policy_4]

In [5]:
# Number of states in the environment. Value arrays of this length are
# displayed further down with reshape(3, 4).
number_of_states = 12

In [6]:
def get_discounted_return(r, gamma=0.9):
    """Compute the discounted return of a reward sequence.

    Evaluates ``r[0] + gamma * r[1] + gamma**2 * r[2] + ...`` by folding the
    rewards from the last step back to the first.

    Args:
        r: 1D sequence (list or array) of per-step rewards.
        gamma: Discount factor applied once per time step.

    Returns:
        The discounted return from the first step as a float
        (``0.0`` when ``r`` is empty).
    """
    # Accept plain lists as well as arrays.
    rewards = np.asarray(r, dtype=float)
    # Start from a numpy float so the result type is the same for empty
    # and non-empty inputs.
    running_add = np.float64(0.0)
    # Only the return at t=0 is needed, so no per-step array is stored.
    for t in reversed(range(rewards.size)):
        running_add = running_add * gamma + rewards[t]
    return running_add

def run_episode(env, policy, start_pos, gamma=1.0):
    """Play one episode following ``policy`` and return its discounted return.

    Args:
        env: Environment exposing ``reset(start_pos)`` (returns the initial
            observation) and ``step(action)`` (returns
            ``(obs, reward, done, info)``).
        policy: Mapping from observation to the action to take.
        start_pos: State the episode starts from.
        gamma: Discount factor forwarded to ``get_discounted_return``.

    Returns:
        The discounted return of the collected rewards.
    """
    state = env.reset(start_pos)
    episode_rewards = []
    while True:
        state, step_reward, finished, _ = env.step(policy[state])
        episode_rewards.append(step_reward)
        if finished:
            break
    return get_discounted_return(episode_rewards, gamma)

def get_expected_return(env, policy, N=5000, start_pos=0, gamma=1.0):
    """Estimate the return from ``start_pos`` by running ``N`` episodes.

    Args:
        env: Environment passed through to ``run_episode``.
        policy: Policy passed through to ``run_episode``.
        N: Number of episodes to run.
        start_pos: State every episode starts from.
        gamma: Discount factor passed through to ``run_episode``.

    Returns:
        Tuple ``(returns, mean, std)``: the float array with one return per
        episode, followed by its mean and standard deviation.
    """
    episode_returns = np.array(
        [run_episode(env, policy, start_pos, gamma) for _ in range(N)],
        dtype=float,
    )
    return episode_returns, np.mean(episode_returns), np.std(episode_returns)

# Ejercicio 1: Muestreo en entorno aleatorio

In [7]:
# Configuration for Exercise 1.
# step_penalty is forwarded to get_frozenlake_env; presumably a per-step
# reward penalty -- TODO confirm in frozenlake_helper.
step_penalty = 0
# Discount factor handed to the sampling estimators below.
gamma = 1.0
# First argument of get_frozenlake_env; presumably turns on stochastic
# transitions -- TODO confirm in frozenlake_helper.
is_slippery = True
env = get_frozenlake_env(is_slippery, step_penalty=step_penalty)
# Policy used by the single-step demo cell that follows.
policy = policy_1

In [8]:
# Take a single step from state 0 to inspect what env.step returns.
obs = env.reset(0)
# Look the action up for the state the agent is actually in, the same way
# run_episode does, instead of a hard-coded state index.
action = policy[obs]
obs, reward, done, info = env.step(action)

In [9]:
info

{'prob': 0.3333333333333333}

### Armar una función que devuelva la estimación de la V(s)
Recibe:
- El entorno (env)
- La política (policy)
- La cantidad de episodios usados para la estimación

Devuelve:
- numpy array de longitud 12 con los "value" donde la posición indica el estado 

In [10]:
def estimate_V_sampling(env, policy, N=10_000, gamma=1.0):
    """Estimate V(s) for every state by averaging sampled episode returns.

    Args:
        env: Environment passed through to ``get_expected_return``.
        policy: Container of states to actions; states that are not in it
            keep a value of 0.
        N: Number of episodes sampled per start state.
        gamma: Discount factor passed through to ``get_expected_return``.

    Returns:
        numpy array of length ``number_of_states`` where index ``s`` holds
        the mean sampled return when starting from state ``s``.
    """
    values = np.zeros(number_of_states)
    for state in range(number_of_states):
        if state not in policy:
            continue
        # get_expected_return yields (returns, mean, std); only the mean is kept.
        values[state] = get_expected_return(env, policy, N, state, gamma)[1]
    return values

In [11]:
# Episodes sampled per start state for each policy.
N = 1_000
# Use a dedicated loop variable so the global `policy` chosen earlier is not
# silently replaced by the last entry of `policies`.
for i, sampled_policy in enumerate(policies):
    Vs_sample = estimate_V_sampling(env, sampled_policy, N=N, gamma=gamma)
    print('V(s) para policy', i + 1)
    # Show the 12 state values laid out as the 3x4 grid.
    print(Vs_sample.reshape(3, 4))
    print()

V(s) para policy 1
[[0.807 1.    1.    1.   ]
 [0.586 1.    1.    1.   ]
 [0.    1.    1.    0.   ]]

V(s) para policy 2
[[0.545 0.652 0.785 0.855]
 [0.408 0.675 0.862 0.922]
 [0.    0.795 0.928 0.   ]]

V(s) para policy 3
[[0.604 0.724 0.83  0.855]
 [0.394 0.678 0.798 0.875]
 [0.    0.774 0.904 0.   ]]

V(s) para policy 4
[[0.    0.    0.    0.204]
 [0.    0.    0.    0.397]
 [0.    0.    0.    0.   ]]



# Ejercicio 2: Armar modelos de entorno y recompensa

### Para el caso del entorno esto sería un diccionario: 
- con keys de todos los estados posibles (de 0 a 11)
- para cada estado un diccionario con keys de las acciones posibles (LEFT, RIGHT, UP, DOWN)
- para cada estado y acción un diccionario que indique el listado de los próximos estados con sus probabilidades

### Para el caso del modelo de recompensa sería: 

Igual al anterior, solo que el último diccionario contiene los próximos estados con la recompensa de cada uno.

En este caso para simplificar el código se guarda:
- count: cantidad de veces que se entró en ese estado, para calcular el reward
- total_reward: la suma de los rewards
- reward: El que nos interesa (total_reward/count)

In [12]:
# Readable name for each action constant; these names are the keys used in
# the transition and reward models.
action_to_str = dict(zip(
    (LEFT, RIGHT, UP, DOWN),
    ('LEFT', 'RIGHT', 'UP', 'DOWN'),
))
# Reverse lookup: action name -> action constant.
str_to_action = {name: action for action, name in action_to_str.items()}

In [15]:
actions = [LEFT, RIGHT, UP, DOWN]

# Number of one-step samples drawn for every (state, action) pair.
N = 10000
# transition_model[state][action_name][next_state] -> visit count while
# sampling, turned into a probability by the normalisation pass below.
transition_model = {}
# reward_model[state][action_name][next_state] -> dict with the running
# 'total_reward', 'count' and their ratio 'reward'.
reward_model = {}

for start_pos in range(number_of_states):
    state_transitions = transition_model.setdefault(start_pos, {})
    state_rewards = reward_model.setdefault(start_pos, {})
    for action in actions:
        # Both models are keyed by the action *name*, so entries are created
        # and looked up with action_str, never with the raw action constant.
        action_str = action_to_str[action]
        next_state_counts = state_transitions.setdefault(action_str, {})
        next_state_rewards = state_rewards.setdefault(action_str, {})
        for sample_idx in range(N):
            # Only the immediate transition matters here, so each sample is a
            # single step from start_pos rather than a full episode.
            env.reset(start_pos)
            obs, reward, done, info = env.step(action)

            if obs not in next_state_counts:
                # First time this next state is seen for (state, action).
                next_state_counts[obs] = 0
                next_state_rewards[obs] = {'total_reward': 0, 'count': 0, 'reward': 0}

            next_state_counts[obs] += 1
            reward_stats = next_state_rewards[obs]
            reward_stats['total_reward'] += reward
            reward_stats['count'] += 1
            reward_stats['reward'] = reward_stats['total_reward'] / reward_stats['count']

# Normalise visit counts into probabilities. The loop variables are named so
# that the global `actions` list defined above is left untouched.
for action_counts in transition_model.values():
    for next_state_counts in action_counts.values():
        total_count = sum(next_state_counts.values())
        for next_state in next_state_counts:
            next_state_counts[next_state] /= total_count

### ¿Depende el modelo del entorno de la policy?

In [16]:
transition_model[0]

{'LEFT': {0: 0.6644, 4: 0.3356},
 'RIGHT': {1: 0.3333, 4: 0.3319, 0: 0.3348},
 'UP': {0: 0.662, 1: 0.338},
 'DOWN': {1: 0.3345, 4: 0.3365, 0: 0.329}}

In [17]:
reward_model[10]

{'LEFT': {9: {'total_reward': 0, 'count': 3429, 'reward': 0.0},
  10: {'total_reward': 0, 'count': 3252, 'reward': 0.0},
  6: {'total_reward': 0, 'count': 3319, 'reward': 0.0}},
 'RIGHT': {6: {'total_reward': 0, 'count': 3274, 'reward': 0.0},
  11: {'total_reward': 3393.0, 'count': 3393, 'reward': 1.0},
  10: {'total_reward': 0, 'count': 3333, 'reward': 0.0}},
 'UP': {6: {'total_reward': 0, 'count': 3321, 'reward': 0.0},
  11: {'total_reward': 3251.0, 'count': 3251, 'reward': 1.0},
  9: {'total_reward': 0, 'count': 3428, 'reward': 0.0}},
 'DOWN': {10: {'total_reward': 0, 'count': 3313, 'reward': 0.0},
  9: {'total_reward': 0, 'count': 3264, 'reward': 0.0},
  11: {'total_reward': 3423.0, 'count': 3423, 'reward': 1.0}}}

In [95]:
reward_model[7]

{'LEFT': {6: {'total_reward': 0, 'count': 3399, 'reward': 0.0},
  3: {'total_reward': 0, 'count': 3271, 'reward': 0.0},
  11: {'total_reward': 3330.0, 'count': 3330, 'reward': 1.0}},
 'RIGHT': {11: {'total_reward': 3289.0, 'count': 3289, 'reward': 1.0},
  7: {'total_reward': 0, 'count': 3344, 'reward': 0.0},
  3: {'total_reward': 0, 'count': 3367, 'reward': 0.0}},
 'UP': {6: {'total_reward': 0, 'count': 3286, 'reward': 0.0},
  7: {'total_reward': 0, 'count': 3374, 'reward': 0.0},
  3: {'total_reward': 0, 'count': 3340, 'reward': 0.0}},
 'DOWN': {7: {'total_reward': 0, 'count': 3348, 'reward': 0.0},
  6: {'total_reward': 0, 'count': 3387, 'reward': 0.0},
  11: {'total_reward': 3265.0, 'count': 3265, 'reward': 1.0}}}

# Ejercicio 3: Value iteration

In [121]:
def estimate_V_by_value_iteration(policy, transition_model, reward_model, N=500, number_of_states=12, gamma=1.0):
    """Estimate V(s) for a fixed policy from the learned models.

    Despite the name, no maximisation over actions is performed: each sweep
    applies the expected one-step backup for the action the policy picks,

        V(s) = sum_{s'} P(s' | s, a) * (R(s, a, s') + gamma * V(s')),

    i.e. iterative policy evaluation. Every sweep reads only the values of
    the previous sweep (synchronous update).

    Args:
        policy: Container of states to actions; states that are not in it
            keep a value of 0.
        transition_model: ``transition_model[s][action_name][next_s]`` is the
            probability of reaching ``next_s``.
        reward_model: ``reward_model[s][action_name][next_s]['reward']`` is
            the reward for that transition.
        N: Number of sweeps over the state space.
        number_of_states: Length of the returned value array.
        gamma: Discount factor applied to the value of the next state. The
            default of 1.0 reproduces the undiscounted backup.

    Returns:
        numpy array of length ``number_of_states`` with the estimated values.
    """
    Vs = np.zeros(number_of_states)
    for _ in range(N):
        # Snapshot so that every state is updated from the same old values.
        oldV = Vs.copy()
        for s in range(number_of_states):
            if s not in policy:
                continue
            # The models are keyed by action name, not by the action constant.
            action = action_to_str[policy[s]]
            rewards = reward_model[s][action]
            Vs[s] = sum(
                prob * (rewards[next_s]['reward'] + gamma * oldV[next_s])
                for next_s, prob in transition_model[s][action].items()
            )
    return Vs

In [None]:
#def value_iteration(S, A, P, R):
#    V = {s: 0 for s in S}
    
#    while True:
#        oldV = V.copy()
        
#        for s in S:
#            Q = {}
#            for a in A:
#                Q[a] = R(s, a) + sum(P(s_next, s, a) * oldV[s_next] for s_next in S)
                
#            V[s] = max(Q.values())
            
#        if all(oldV[s] == V[s] for s in S):
#            break

In [130]:
# Number of sweeps for the model-based evaluation.
N = 100
# Evaluate policy_1 using the transition and reward models built above.
policy = policy_1
Vs = estimate_V_by_value_iteration(policy, transition_model, reward_model, N=N)

In [131]:
Vs.reshape(3, 4)

array([[0.8022516 , 0.99999786, 0.99999842, 0.9999986 ],
       [0.6036721 , 0.99999814, 0.99999886, 0.99999931],
       [0.        , 0.99999842, 0.99999931, 0.        ]])

In [120]:
# Cross-check: estimate V(s) by sampling for whatever `policy` currently
# holds, reusing `env` and `gamma` from earlier cells, so the result can be
# compared with the model-based values.
# Here N is the number of episodes sampled per start state.
N = 200
Vs_sample_policy = estimate_V_sampling(env, policy, N=N, gamma=gamma)
Vs_sample_policy.reshape(3, 4)

array([[0.79, 1.  , 1.  , 1.  ],
       [0.55, 1.  , 1.  , 1.  ],
       [0.  , 1.  , 1.  , 0.  ]])