<a href="https://colab.research.google.com/github/manojvenaram/monte-carlo-control/blob/main/Exp5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install git+https://github.com/mimoralea/gym-walk#egg=gym-walk

Collecting gym-walk
  Cloning https://github.com/mimoralea/gym-walk to /tmp/pip-install-c_9a6rfk/gym-walk_ea2994e7fb6e4371afdaf19977de2fb5
  Running command git clone --filter=blob:none --quiet https://github.com/mimoralea/gym-walk /tmp/pip-install-c_9a6rfk/gym-walk_ea2994e7fb6e4371afdaf19977de2fb5
  Resolved https://github.com/mimoralea/gym-walk to commit 5999016267d6de2f5a63307fb00dfd63de319ac1
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: gym-walk
  Building wheel for gym-walk (setup.py) ... done
  Created wheel for gym-walk: filename=gym_walk-0.0.2-py3-none-any.whl size=4053 sha256=01e513276debdfa6e720e952d4623b4abb95efc83901fdfd80fba26e5dcd28c8
  Stored in directory: /tmp/pip-ephem-wheel-cache-r3dqybsv/wheels/24/fe/c4/0cbc7511d29265bad7e28a09311db3f87f0cafba74af54d530
Successfully built gym-walk
Installing collected packages: gym-walk
Successfully installed gym-walk-0.0.2


In [2]:
# Install the blanket warning filter first so it is already active while the
# remaining imports below execute.
import warnings ; warnings.filterwarnings('ignore')

import gym, gym_walk
import numpy as np

import random
import warnings  # NOTE(review): duplicate of the import at the top of this cell; harmless

# NOTE(review): redundant with the blanket 'ignore' filter installed above.
warnings.filterwarnings('ignore', category=DeprecationWarning)
np.set_printoptions(suppress=True)  # print small floats in fixed-point rather than scientific notation
# Seed both the stdlib and NumPy global RNGs for repeatable runs.
random.seed(123); np.random.seed(123)
import matplotlib.pyplot as plt  # NOTE(review): not referenced by any cell visible in this notebook


In [3]:
def print_policy(pi, P, action_symbols=('<', 'v', '>', '^'), n_cols=4, title='Policy:'):
    """Print the action a policy picks in every state, laid out as a grid.

    Args:
        pi: Callable mapping a state index to an action index.
        P: Transition table indexed by state; ``P[s]`` maps each action to a
            list of 4-tuples whose last element is the ``done`` flag.
        action_symbols: Glyph for each action index, in index order.
        n_cols: Number of cells printed per row.
        title: Heading printed before the grid.

    A state whose every listed transition has ``done`` set is treated as
    terminal and rendered as a blank cell.
    """
    print(title)
    symbol_of = dict(enumerate(action_symbols))
    for state in range(len(P)):
        chosen = pi(state)
        print("| ", end="")
        is_terminal = np.all([
            done
            for transitions in P[state].values()
            for _, _, _, done in transitions
        ])
        if not is_terminal:
            print(str(state).zfill(2), symbol_of[chosen].rjust(6), end=" ")
        else:
            print("".rjust(9), end=" ")
        # Close the row once n_cols cells have been emitted.
        end_of_row = (state + 1) % n_cols == 0
        if end_of_row:
            print("|")

In [4]:
def print_state_value_function(V, P, n_cols=4, prec=4, title='State-value function:'):
    """Print one value per state, laid out as a grid.

    Args:
        V: Indexable by state; ``V[s]`` is rounded with ``np.round`` and
            formatted with ``'{}'``-style formatting, so it may be a scalar or
            an array (e.g. one row of an action-value table).
        P: Transition table indexed by state; ``P[s]`` maps each action to a
            list of 4-tuples whose last element is the ``done`` flag.
        n_cols: Number of cells printed per row.
        prec: Number of decimals kept when rounding.
        title: Heading printed before the grid.

    A state whose every listed transition has ``done`` set is treated as
    terminal and rendered as a blank cell.
    """
    print(title)
    for state in range(len(P)):
        value = V[state]
        print("| ", end="")
        is_terminal = np.all([
            done
            for transitions in P[state].values()
            for _, _, _, done in transitions
        ])
        if not is_terminal:
            rendered = f"{np.round(value, prec)}".rjust(6)
            print(str(state).zfill(2), rendered, end=" ")
        else:
            print("".rjust(9), end=" ")
        # Close the row once n_cols cells have been emitted.
        end_of_row = (state + 1) % n_cols == 0
        if end_of_row:
            print("|")

In [5]:
def probability_success(env, pi, goal_state, n_episodes=100, max_steps=200, seed=123):
    """Estimate how often following ``pi`` ends an episode in ``goal_state``.

    Args:
        env: Environment exposing ``seed(int)``, ``reset() -> state`` and
            ``step(action) -> (state, reward, done, info)``.
        pi: Callable mapping a state to an action.
        goal_state: State that counts as success when an episode stops there.
        n_episodes: Number of evaluation episodes to run.
        max_steps: Per-episode step cap; an episode cut off by the cap is
            scored by whatever state it was in at that point.
        seed: Seed applied to ``random``, ``np.random`` and ``env`` before the
            rollouts. Defaults to 123, the value that used to be hard-coded.

    Returns:
        Fraction of episodes whose final state equals ``goal_state``.
        With ``n_episodes == 0`` the division is 0/0 and the result is NaN.
    """
    # Re-seed every RNG involved so repeated evaluations are comparable.
    random.seed(seed); np.random.seed(seed); env.seed(seed)
    outcomes = []
    for _ in range(n_episodes):
        state, done, steps = env.reset(), False, 0
        while not done and steps < max_steps:
            state, _, done, _ = env.step(pi(state))
            steps += 1
        outcomes.append(state == goal_state)
    return np.sum(outcomes) / len(outcomes)

In [6]:
def mean_return(env, pi, n_episodes=100, max_steps=200, seed=123):
    """Estimate the average undiscounted return obtained by following ``pi``.

    Args:
        env: Environment exposing ``seed(int)``, ``reset() -> state`` and
            ``step(action) -> (state, reward, done, info)``.
        pi: Callable mapping a state to an action.
        n_episodes: Number of evaluation episodes to run.
        max_steps: Per-episode step cap; rewards are summed only up to it.
        seed: Seed applied to ``random``, ``np.random`` and ``env`` before the
            rollouts. Defaults to 123, the value that used to be hard-coded.

    Returns:
        Mean over episodes of the plain (undiscounted) sum of rewards.
        With ``n_episodes == 0`` the mean of an empty list is NaN.
    """
    # Re-seed every RNG involved so repeated evaluations are comparable.
    random.seed(seed); np.random.seed(seed); env.seed(seed)
    episode_returns = []
    for _ in range(n_episodes):
        state, done, steps = env.reset(), False, 0
        total = 0.0
        while not done and steps < max_steps:
            state, reward, done, _ = env.step(pi(state))
            total += reward
            steps += 1
        episode_returns.append(total)
    return np.mean(episode_returns)

In [7]:
# Build the FrozenLake environment used throughout the notebook.
env = gym.make('FrozenLake-v1')
# Transition table, indexed by state: P[s] maps each action to a list of
# 4-tuples whose last element is the `done` flag (this is how the printing
# helpers read it). `unwrapped` reaches the base environment however many
# wrappers gym.make stacked on top, instead of assuming exactly one (`env.env`).
P = env.unwrapped.P
init_state = env.reset()
goal_state = 15
# Action indices in the same order as print_policy's default symbols
# ('<', 'v', '>', '^'). The previous `LEFT, RIGHT = range(2)` made RIGHT == 1,
# which is the "down" action under that ordering.
LEFT, DOWN, RIGHT, UP = range(4)

In [8]:
def decay_schedule(init_value,min_value,decay_ratio,max_steps,log_start=-2,log_base=10):
  """Build a per-step schedule that decays from `init_value` to `min_value`.

  The first `int(max_steps * decay_ratio)` entries follow a reversed,
  min-max-normalised log-spaced curve running from `init_value` down to
  `min_value`; every remaining entry holds `min_value`.

  Args:
    init_value: Value of the first entry.
    min_value: Value the schedule decays to and then holds.
    decay_ratio: Fraction of `max_steps` spent decaying.
    max_steps: Total length of the returned schedule.
    log_start: Starting exponent of the log-spaced curve (the end exponent is
      0). Must be non-zero, otherwise every point coincides and the
      normalisation divides by zero.
    log_base: Base of the log-spaced curve.

  Returns:
    1-D float array of length `max_steps`.
  """
  decay_steps=int(max_steps*decay_ratio)
  # Plain subtraction: the original chained assignment also overwrote
  # `max_steps` with the remainder.
  rem_steps=max_steps-decay_steps
  if decay_steps<2:
    # Fewer than two decay points cannot be min-max normalised: zero points
    # used to raise on an empty reduction, one point produced 0/0 = NaN for
    # the whole schedule. Emit `init_value` for the (at most one) decay step
    # and hold `min_value` afterwards.
    head=np.full(decay_steps,init_value,dtype=np.float64)
    tail=np.full(rem_steps,min_value,dtype=np.float64)
    return np.concatenate([head,tail])
  values=np.logspace(log_start,0,decay_steps,base=log_base,endpoint=True)[::-1]
  # Rescale the curve to [0, 1], then stretch it onto [min_value, init_value].
  values=(values-values.min())/(values.max()-values.min())
  values=(init_value-min_value)*values+min_value
  # The last decayed entry equals min_value, so 'edge' padding holds it.
  values=np.pad(values,(0,rem_steps),'edge')
  return values

In [9]:
from itertools import count
def generate_trajectory(select_action,Q,epsilon,env,max_steps=20):
  """Roll out episodes until one terminates, and return that episode.

  Args:
    select_action: Callable `(state, Q, epsilon) -> action`.
    Q: Passed through to `select_action` unchanged.
    epsilon: Passed through to `select_action` unchanged.
    env: Environment exposing `reset() -> state` and
      `step(action) -> (next_state, reward, done, info)`.
    max_steps: Step budget per attempt. An attempt that uses up the budget
      without `done` being set is discarded and a new episode is started.

  Returns:
    Object array with one row per step:
    `(state, action, reward, next_state, done)`.
  """
  episode=[]
  finished=False
  while not finished:
    current=env.reset()
    steps_taken=0
    while True:
      chosen=select_action(current,Q,epsilon)
      successor,reward,finished,_=env.step(chosen)
      episode.append((current,chosen,reward,successor,finished))
      steps_taken+=1
      if finished:
        break
      if steps_taken>=max_steps:
        # Budget exhausted without termination: drop this attempt and retry.
        episode=[]
        break
      current=successor
  return np.array(episode,object)

In [10]:
from tqdm import tqdm
def mc_control(env, gamma=1.0, init_alpha=0.5, min_alpha=0.01, alpha_decay_ratio=0.5,
               init_epsilon=1.0, min_epsilon=0.1, epsilon_decay_ratio=0.9,
               n_episodes=150000, max_steps=200, first_visit=True):
  """Learn an action-value table with Monte Carlo control.

  Each episode is generated with an epsilon-greedy policy over the current
  `Q`; every (first-visited, when `first_visit` is true) state-action pair is
  then moved towards the discounted return observed from that step onward.
  The step size and epsilon both follow `decay_schedule`.

  Args:
    env: Environment with discrete `observation_space.n` / `action_space.n`,
      usable by `generate_trajectory`.
    gamma: Discount factor.
    init_alpha, min_alpha, alpha_decay_ratio: Step-size schedule parameters.
    init_epsilon, min_epsilon, epsilon_decay_ratio: Exploration schedule
      parameters.
    n_episodes: Number of training episodes.
    max_steps: Per-episode step budget handed to `generate_trajectory`; also
      the number of pre-computed discount factors.
    first_visit: If true, update each state-action pair only at its first
      occurrence in an episode; otherwise at every occurrence.

  Returns:
    Tuple `(Q, V, pi, Q_track, pi_track)`:
      Q: `(nS, nA)` learned action values.
      V: `(nS,)` row-wise maximum of `Q`.
      pi: Callable returning the greedy action for a state index.
      Q_track: `(n_episodes, nS, nA)` copy of `Q` after each episode.
      pi_track: List with the greedy action array after each episode.
  """
  nS, nA = env.observation_space.n, env.action_space.n
  # discounts[k] == gamma**k for k = 0 .. max_steps-1
  discounts=np.logspace(0,max_steps,num=max_steps,base=gamma,endpoint=False)
  alphas=decay_schedule(init_alpha,min_alpha,alpha_decay_ratio,n_episodes)
  epsilons=decay_schedule(init_epsilon,min_epsilon,epsilon_decay_ratio,n_episodes)
  pi_track=[]
  Q = np.zeros((nS, nA), dtype=np.float64)
  Q_track = np.zeros((n_episodes, nS, nA), dtype=np.float64)

  def select_action(state,Q,epsilon):
    # Epsilon-greedy: exploit when the draw exceeds epsilon, else act uniformly.
    if np.random.random()>epsilon:
      return np.argmax(Q[state])
    return np.random.randint(len(Q[state]))

  for e in tqdm(range(n_episodes),leave=False):
    trajectory=generate_trajectory(select_action,Q,epsilons[e],env,max_steps)
    visited=np.zeros((nS,nA),dtype=bool)
    for t,(state,action,reward,_,_) in enumerate(trajectory):
      if visited[state][action] and first_visit:
        continue
      visited[state][action]=True
      # Discounted return from step t to the end of the episode
      # (column 2 of the trajectory holds the rewards).
      n_steps=len(trajectory[t:])
      G=np.sum(discounts[:n_steps]*trajectory[t:,2])
      Q[state][action]=Q[state][action]+alphas[e]*(G-Q[state][action])
    Q_track[e]=Q
    pi_track.append(np.argmax(Q,axis=1))
  V=np.max(Q,axis=1)
  # Compute the greedy action per state once; the previous lambda re-ran the
  # argmax and rebuilt an nS-entry dict on every single call.
  greedy_actions=np.argmax(Q,axis=1)
  pi=lambda s: greedy_actions[s]
  return Q,V,pi,Q_track,pi_track


In [11]:
def results(env, optimal_pi, goal_state, seed=123):
    """Print a one-line evaluation summary for a policy.

    Reports the success rate from `probability_success` and the average
    undiscounted return from `mean_return`, both run with their default
    episode counts and step limits.

    Args:
        env: Environment to evaluate on.
        optimal_pi: Callable mapping a state to an action.
        goal_state: State that counts as success.
        seed: Accepted but not used by this function.
    """
    success_rate = probability_success(env, optimal_pi, goal_state=goal_state)
    avg_return = mean_return(env, optimal_pi)

    summary = f'Reaches goal {success_rate:.2%}. Obtains an average undiscounted return of: {avg_return:.4f}.'
    print(summary)

# State index treated as the goal when measuring the success rate.
goal_state = 15

In [12]:
# Train with the default hyperparameters. The two discarded return values are
# the per-episode Q history and the per-episode greedy-policy history.
Q, V, pi,_,_ = mc_control (env)



In [13]:
# Evaluate the learned greedy policy: goal-reaching rate and mean undiscounted return.
results(env, pi, goal_state=goal_state)

Reaches goal 71.00%. Obtains an average undiscounted return of: 0.7100.


In [14]:
# Show the greedy action chosen in each non-terminal state (terminal cells print blank).
print_policy(pi, P)

Policy:
| 00      < | 01      ^ | 02      ^ | 03      ^ |
| 04      < |           | 06      > |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |


In [15]:
# Reuse the state-value printer for Q: Q[s] is a row of action values, so each
# cell shows the whole rounded row for that state.
print_state_value_function(Q, P, n_cols=4, prec=4, title='Action-value function:')

Action-value function:
| 00 [0.4125 0.3321 0.3104 0.3407] | 01 [0.195  0.2064 0.1638 0.3288] | 02 [0.1744 0.1781 0.168  0.2455] | 03 [0.1299 0.1305 0.0718 0.2279] |
| 04 [0.4163 0.2472 0.2692 0.2467] |           | 06 [0.1743 0.1838 0.2033 0.0955] |           |
| 08 [0.3025 0.2875 0.2444 0.4565] | 09 [0.3107 0.5219 0.3803 0.3181] | 10 [0.4971 0.4139 0.3682 0.2357] |           |
|           | 13 [0.36   0.5086 0.637  0.4199] | 14 [0.6384 0.7766 0.7339 0.6524] |           |


In [16]:
# V is the row-wise maximum of Q returned by mc_control, i.e. the value of the greedy action.
print_state_value_function(V, P, n_cols=4, prec=4, title='State-value function:')

State-value function:
| 00 0.4125 | 01 0.3288 | 02 0.2455 | 03 0.2279 |
| 04 0.4163 |           | 06 0.2033 |           |
| 08 0.4565 | 09 0.5219 | 10 0.4971 |           |
|           | 13  0.637 | 14 0.7766 |           |
