In [None]:
!pip install git+https://github.com/mimoralea/gym-walk#egg=gym-walk





Collecting gym-walk
  Cloning https://github.com/mimoralea/gym-walk to /tmp/pip-install-p7ig9jct/gym-walk_84abde56bd5546229270c41e3ab6b132
  Running command git clone --filter=blob:none --quiet https://github.com/mimoralea/gym-walk /tmp/pip-install-p7ig9jct/gym-walk_84abde56bd5546229270c41e3ab6b132
  Resolved https://github.com/mimoralea/gym-walk to commit b915b94cf2ad16f8833a1ad92ea94e88159279f5
  Preparing metadata (setup.py) ... done


In [None]:
import gym, gym_walk
import numpy as np

import random
import warnings

# Hide DeprecationWarning messages so they do not clutter cell outputs.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

# Ask numpy to print small floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)

# Seed both global random number generators with the same fixed value.
random.seed(123)
np.random.seed(123)

In [None]:
def print_policy(pi, P, action_symbols=('<', 'v', '>', '^'), n_cols=4, title='Policy:'):
    """Print a policy as a text grid, one cell per state.

    Args:
        pi: The policy to display. Either a callable mapping a state index to
            an action index, or an object indexable by state (list, array, dict).
        P: Transition table indexed as ``P[state][action]``; every entry is a
            list of ``(prob, next_state, reward, done)`` tuples.
        action_symbols: Symbol printed for each action index.
        n_cols: Number of cells printed per row.
        title: Heading printed before the grid.
    """
    print(title)
    arrs = dict(enumerate(action_symbols))
    for s in range(len(P)):
        # Support both function-style policies (as returned by mc_control)
        # and table-style policies indexed by state.
        a = pi(s) if callable(pi) else pi[s]
        print("| ", end="")
        # A state whose every listed transition is flagged done gets a blank cell.
        if np.all([done for action in P[s].values() for _, _, _, done in action]):
            print("".rjust(9), end=" ")
        else:
            print(str(s).zfill(2), arrs[a].rjust(6), end=" ")
        # Close the row after every n_cols cells.
        if (s + 1) % n_cols == 0: print("|")




In [None]:
def print_state_value_function(V, P, n_cols=4, prec=3, title='State-value function:'):
    """Print per-state values as a text grid.

    Args:
        V: Values indexed by state; each entry is rounded with ``np.round``.
        P: Transition table indexed as ``P[state][action]``; every entry is a
            list of ``(prob, next_state, reward, done)`` tuples.
        n_cols: Number of cells printed per row.
        prec: Number of decimals passed to ``np.round``.
        title: Heading printed before the grid.
    """
    print(title)
    for state in range(len(P)):
        value = V[state]
        print("| ", end="")
        # States whose every listed transition is flagged done are shown blank.
        only_done = all(
            done
            for outcomes in P[state].values()
            for _, _, _, done in outcomes
        )
        if only_done:
            print("".rjust(9), end=" ")
        else:
            label = str(state).zfill(2)
            shown = '{}'.format(np.round(value, prec)).rjust(6)
            print(label, shown, end=" ")
        # Close the row after every n_cols cells.
        if (state + 1) % n_cols == 0:
            print("|")



In [None]:

def probability_success(env, pi, goal_state, n_episodes=100, max_steps=200):
    """Return the fraction of episodes that end in ``goal_state``.

    The global ``random`` and ``numpy`` generators and the environment are
    re-seeded with 123 before the rollouts start.

    Args:
        env: Environment exposing ``seed``, ``reset`` and a ``step`` that
            returns a 4-tuple ``(state, reward, done, info)``.
        pi: Callable mapping a state to the action to take.
        goal_state: State that counts as success when an episode stops there.
        n_episodes: Number of episodes to roll out.
        max_steps: Per-episode cap on the number of steps.
    """
    random.seed(123)
    np.random.seed(123)
    env.seed(123)

    outcomes = []
    for _episode in range(n_episodes):
        current = env.reset()
        finished = False
        taken = 0
        while taken < max_steps and not finished:
            current, _, finished, _ = env.step(pi(current))
            taken += 1
        # Success is judged on the state the rollout stopped in.
        outcomes.append(current == goal_state)
    return np.sum(outcomes) / len(outcomes)

In [None]:
def mean_return(env, pi, n_episodes=100, max_steps=200):
    """Return the average undiscounted episode return obtained by ``pi``.

    The global ``random`` and ``numpy`` generators and the environment are
    re-seeded with 123 before the rollouts start.

    Args:
        env: Environment exposing ``seed``, ``reset`` and a ``step`` that
            returns a 4-tuple ``(state, reward, done, info)``.
        pi: Callable mapping a state to the action to take.
        n_episodes: Number of episodes to roll out.
        max_steps: Per-episode cap on the number of steps.
    """
    random.seed(123)
    np.random.seed(123)
    env.seed(123)

    episode_returns = []
    for _episode in range(n_episodes):
        current = env.reset()
        finished = False
        taken = 0
        total = 0.0
        while taken < max_steps and not finished:
            current, reward, finished, _ = env.step(pi(current))
            total += reward
            taken += 1
        episode_returns.append(total)
    return np.mean(episode_returns)

In [None]:

# Build the FrozenLake environment used by the rest of the notebook.
env = gym.make('FrozenLake-v1')
# Read the transition table from the base environment: `unwrapped` strips every
# wrapper that gym.make adds, whereas `env.env` removes only the outermost one.
P = env.unwrapped.P
init_state = env.reset()
# State index passed to probability_success as the goal.
goal_state = 15

In [None]:
P

{0: {0: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  2: [(0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)],
  3: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 1: {0: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False)],
  2: [(0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  3: [(0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 2:

In [None]:
def decay_schedule(init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10):
    """Build a per-step schedule that decays from ``init_value`` to ``min_value``.

    The first ``int(max_steps * decay_ratio)`` entries follow a reversed,
    normalised ``np.logspace`` curve that starts at ``init_value`` and ends at
    ``min_value``; every remaining entry is held at ``min_value``.

    Args:
        init_value: Value of the first entry of the schedule.
        min_value: Value reached at the end of the decay phase and held after it.
        decay_ratio: Fraction of ``max_steps`` spent decaying; the resulting
            step count is clamped to ``[0, max_steps]``.
        max_steps: Total length of the returned schedule.
        log_start: Starting exponent handed to ``np.logspace`` (the end
            exponent is 0); controls how sharply the curve bends.
        log_base: Base handed to ``np.logspace``.

    Returns:
        A 1-D float array of length ``max_steps`` (empty when ``max_steps <= 0``).
    """
    if max_steps <= 0:
        return np.array([])

    decay_steps = min(max(int(max_steps * decay_ratio), 0), max_steps)

    # Start from the plateau and overwrite the leading decay phase.
    values = np.full(max_steps, min_value, dtype=float)
    if decay_steps == 1:
        # A single decay step has no curve to normalise; it is just the start value.
        values[0] = init_value
    elif decay_steps > 1:
        curve = np.logspace(log_start, 0, decay_steps, base=log_base, endpoint=True)[::-1]
        span = curve.max() - curve.min()
        if span > 0:
            # Rescale to [0, 1] so the endpoints map exactly onto init/min.
            curve = (curve - curve.min()) / span
        else:
            # Degenerate logspace (e.g. log_start == 0): fall back to a linear ramp.
            curve = np.linspace(1.0, 0.0, decay_steps)
        values[:decay_steps] = (init_value - min_value) * curve + min_value
    return values

In [None]:
from itertools import count
def generate_trajectory(select_action, Q, epsilon, env, max_steps=200):
    """Roll out one episode with an epsilon-greedy behaviour policy.

    Args:
        select_action: Callable that picks an action from ``Q[state]``.
        Q: Action-value table indexed by state.
        epsilon: Probability of taking ``env.action_space.sample()`` instead
            of the action chosen by ``select_action``.
        env: Environment exposing ``reset``, ``action_space.sample`` and a
            ``step`` that returns a 4-tuple ``(state, reward, done, info)``.
        max_steps: Upper bound on the number of steps recorded.

    Returns:
        An object-dtype array built from the ``(state, action, reward)``
        tuples, in the order they were experienced.
    """
    experience = []
    current = env.reset()
    for _ in range(max_steps):
        explore = random.random() < epsilon
        if explore:
            chosen = env.action_space.sample()
        else:
            chosen = select_action(Q[current])
        successor, reward, finished, _ = env.step(chosen)
        experience.append((current, chosen, reward))
        if finished:
            break
        current = successor
    return np.array(experience, object)

In [None]:
def mc_control(env, gamma=1.0,
               init_alpha=0.5, min_alpha=0.01, alpha_decay_ratio=0.5,
               init_epsilon=1.0, min_epsilon=0.1, epsilon_decay_ratio=0.9,
               n_episodes=3000, max_steps=200, first_visit=True):
    """Estimate action values with Monte Carlo control and derive a greedy policy.

    Each episode is generated by ``generate_trajectory`` using an
    epsilon-greedy policy over the current ``Q``; the discounted return that
    follows each visited ``(state, action)`` pair is then blended into ``Q``
    with step size ``alpha``. Both ``alpha`` and ``epsilon`` come from
    ``decay_schedule``, one value per episode.

    Args:
        env: Environment exposing ``observation_space.n`` and ``action_space.n``,
            plus whatever ``generate_trajectory`` requires.
        gamma: Discount factor applied to future rewards.
        init_alpha, min_alpha, alpha_decay_ratio: Step-size schedule arguments.
        init_epsilon, min_epsilon, epsilon_decay_ratio: Exploration schedule
            arguments.
        n_episodes: Number of episodes to sample.
        max_steps: Per-episode step cap forwarded to ``generate_trajectory``.
        first_visit: When True, only the first occurrence of each
            ``(state, action)`` pair in an episode updates ``Q``; when False,
            every occurrence does.

    Returns:
        ``(Q, V, pi)``: the action-value table, its row-wise maximum, and a
        callable returning the greedy action for a state.
    """
    nS, nA = env.observation_space.n, env.action_space.n
    Q = np.zeros((nS, nA))

    def select_action(Qs): return np.argmax(Qs)

    alpha_schedule = decay_schedule(init_alpha, min_alpha, alpha_decay_ratio, n_episodes)
    epsilon_schedule = decay_schedule(init_epsilon, min_epsilon, epsilon_decay_ratio, n_episodes)

    for i in range(n_episodes):
        epsilon, alpha = epsilon_schedule[i], alpha_schedule[i]
        trajectory = generate_trajectory(select_action, Q, epsilon, env, max_steps)

        # Earliest time step at which each (state, action) pair occurs. The
        # return is accumulated walking backwards, so a plain "seen" set would
        # select the LAST occurrence instead of the first.
        first_seen = {}
        for t, (s, a, _) in enumerate(trajectory):
            first_seen.setdefault((s, a), t)

        G = 0.0
        for t in reversed(range(len(trajectory))):
            s, a, r = trajectory[t]
            # Return from step t onward, built incrementally from the tail.
            G = gamma * G + r
            if first_visit and first_seen[(s, a)] != t:
                continue
            Q[s, a] += alpha * (G - Q[s, a])

    V = np.max(Q, axis=1)
    pi = lambda s: np.argmax(Q[s])
    return Q, V, pi

In [None]:
# Run Monte Carlo control on `env` for 3000 episodes; every other
# hyperparameter is left at mc_control's default.
optimal_Q, optimal_V, optimal_pi = mc_control(env, n_episodes=3000)

In [None]:
def print_policy(pi, P, action_symbols=('<', 'v', '>', '^'), n_cols=4, title='Policy:'):
    """Print a callable policy as a text grid, one cell per state.

    Args:
        pi: Callable mapping a state index to an action index.
        P: Transition table indexed as ``P[state][action]``; every entry is a
            list of ``(prob, next_state, reward, done)`` tuples.
        action_symbols: Symbol printed for each action index.
        n_cols: Number of cells printed per row.
        title: Heading printed before the grid.
    """
    print(title)
    symbol_for = dict(enumerate(action_symbols))
    for state in range(len(P)):
        # The policy is queried for every state, including blank ones.
        chosen = pi(state)
        print("| ", end="")
        only_done = all(
            done
            for outcomes in P[state].values()
            for _, _, _, done in outcomes
        )
        if only_done:
            # Every listed transition is flagged done: leave the cell empty.
            print("".rjust(9), end=" ")
        else:
            label = str(state).zfill(2)
            print(label, symbol_for[chosen].rjust(6), end=" ")
        # Close the row after every n_cols cells.
        if (state + 1) % n_cols == 0:
            print("|")

In [None]:
# Header line identifying the author of the report.
print('Name: NIRAUNJANA GAYATHRI G RRegister Number: 212222230096')
# The state-value printer is reused for Q: each cell then shows that state's
# whole row of action values.
print_state_value_function(optimal_Q, P, n_cols=4, prec=2, title='Action-value function:')
print_state_value_function(optimal_V, P, n_cols=4, prec=2, title='State-value function:')
# optimal_pi is a callable, so this relies on the print_policy variant that calls pi(s).
print_policy(optimal_pi, P)

Name: NIRAUNJANA GAYATHRI G RRegister Number: 212222230096
Action-value function:
| 00 [0.   0.06 0.   0.  ] | 01 [0.01 0.01 0.03 0.01] | 02 [0.03 0.06 0.   0.  ] | 03 [0. 0. 0. 0.] |
| 04 [0.06 0.   0.   0.  ] |           | 06 [0.08 0.05 0.01 0.  ] |           |
| 08 [0.   0.02 0.   0.06] | 09 [0.06 0.05 0.03 0.08] | 10 [0.17 0.1  0.11 0.  ] |           |
|           | 13 [0.03 0.17 0.14 0.14] | 14 [0.12 0.38 0.42 0.32] |           |
State-value function:
| 00   0.06 | 01   0.03 | 02   0.06 | 03    0.0 |
| 04   0.06 |           | 06   0.08 |           |
| 08   0.06 | 09   0.08 | 10   0.17 |           |
|           | 13   0.17 | 14   0.42 |           |
Policy:
| 00      v | 01      > | 02      v | 03      v |
| 04      < |           | 06      < |           |
| 08      ^ | 09      ^ | 10      < |           |
|           | 13      v | 14      > |           |


In [None]:
# Find the probability of success and the mean return of your policy.
# Both helpers re-seed the RNGs and the environment before rolling out episodes.
print('Name: NIRAUNJANA GAYATHRI G RRegister Number: 212222230096')
print('Reaches goal {:.2f}%. Obtains an average undiscounted return of {:.4f}.'.format(
    probability_success(env, optimal_pi, goal_state=goal_state)*100,
    mean_return(env, optimal_pi)))

Name: NIRAUNJANA GAYATHRI G RRegister Number: 212222230096
Reaches goal 7.00%. Obtains an average undiscounted return of 0.0700.
