In [1]:
import gymnasium as gym
import numpy as np
import numpy.random

In [2]:
# Q-learning hyperparameters.
EPSILON = 0.3  # probability of picking a random (exploratory) action
ALPHA = 0.7    # learning rate: weight given to the new TD target
GAMMA = 0.9    # discount applied to the best Q-value of the next state

In [3]:
# Action-value table indexed as Qs[state, action], with every entry starting
# at zero. Sized for 48 states and 4 actions.
Qs = np.zeros(shape=(48, 4), dtype=float)

In [4]:
# Tabular Q-learning on CliffWalking-v0 with an epsilon-greedy behaviour policy.
# Updates the module-level `Qs` table in place and records in `state_counts`
# how many times each state was entered via `env.step`.
# NOTE: `Qs` is not re-initialised here, so re-running only this cell continues
# training from the current table.
N_TRAIN_STEPS = 100_000  # total environment steps (spans many episodes)
TRAIN_SEED = 0           # fixed seed so a fresh-kernel run repeats the same training

rng = np.random.default_rng(TRAIN_SEED)
state_counts = dict.fromkeys(range(Qs.shape[0]), 0)

env = gym.make('CliffWalking-v0')
try:
    env.action_space.seed(TRAIN_SEED)  # makes action_space.sample() repeatable
    observation, info = env.reset(seed=TRAIN_SEED)
    for _ in range(N_TRAIN_STEPS):
        old_observation = observation
        if rng.random() < EPSILON:
            action = env.action_space.sample()  # explore: uniformly random action
        else:
            action = np.argmax(Qs[old_observation, :])  # exploit: greedy action
        observation, reward, terminated, truncated, info = env.step(action)

        # A terminal state has no future return, so do not bootstrap from it.
        # Truncation is not termination, so a truncated step still bootstraps.
        next_value = 0.0 if terminated else np.max(Qs[observation, :])
        td_target = reward + GAMMA * next_value
        Qs[old_observation, action] = (
            ALPHA * td_target + (1 - ALPHA) * Qs[old_observation, action]
        )

        state_counts[observation] += 1

        if terminated or truncated:
            observation, info = env.reset()
finally:
    env.close()  # release the environment even if a step raises

In [5]:
# Show how many times each state was entered via env.step during training.
state_counts

{0: 167,
 1: 232,
 2: 321,
 3: 435,
 4: 565,
 5: 710,
 6: 839,
 7: 949,
 8: 1039,
 9: 1132,
 10: 1204,
 11: 1362,
 12: 936,
 13: 1390,
 14: 1776,
 15: 1976,
 16: 2082,
 17: 2149,
 18: 2155,
 19: 2142,
 20: 2101,
 21: 2019,
 22: 2108,
 23: 3256,
 24: 9010,
 25: 7519,
 26: 6399,
 27: 5508,
 28: 4805,
 29: 4129,
 30: 3654,
 31: 3260,
 32: 2882,
 33: 2580,
 34: 2582,
 35: 4844,
 36: 6035,
 37: 0,
 38: 0,
 39: 0,
 40: 0,
 41: 0,
 42: 0,
 43: 0,
 44: 0,
 45: 0,
 46: 0,
 47: 3748}

In [6]:
# Display the learned Q-table (indexed as Qs[state, action]).
Qs

array([[  -7.94042418,   -7.71232075,   -7.71232075,   -7.94099202],
       [  -7.71231822,   -7.45813417,   -7.45813417,   -7.9405176 ],
       [  -7.45813417,   -7.17570464,   -7.17570464,   -7.71232056],
       [  -7.17570464,   -6.86189404,   -6.86189404,   -7.45813417],
       [  -6.86189404,   -6.5132156 ,   -6.5132156 ,   -7.17570464],
       [  -6.5132156 ,   -6.12579511,   -6.12579511,   -6.86189404],
       [  -6.12579511,   -5.6953279 ,   -5.6953279 ,   -6.5132156 ],
       [  -5.6953279 ,   -5.217031  ,   -5.217031  ,   -6.12579511],
       [  -5.217031  ,   -4.68559   ,   -4.68559   ,   -5.6953279 ],
       [  -4.68559   ,   -4.0951    ,   -4.0951    ,   -5.217031  ],
       [  -4.0951    ,   -3.439     ,   -3.439     ,   -4.68559   ],
       [  -3.439     ,   -3.439     ,   -2.71      ,   -4.0951    ],
       [  -7.94108868,   -7.45813417,   -7.45813417,   -7.71232075],
       [  -7.71232075,   -7.17570464,   -7.17570464,   -7.71232075],
       [  -7.45813417,   -6.861894

In [7]:
# Persist the learned Q-table to disk in NumPy's .npy format.
np.save(file='cliff_walking_qs.npy', arr=Qs)

In [8]:
# Greedy rollout of the learned policy in a rendered window, for at most
# MAX_EVAL_STEPS steps. This cell only reads `Qs`; it does not update the
# Q-table or the training-time `state_counts`, so it is safe to re-run.
MAX_EVAL_STEPS = 15

env = gym.make('CliffWalking-v0', render_mode='human')
try:
    observation, info = env.reset()
    for _ in range(MAX_EVAL_STEPS):
        action = np.argmax(Qs[observation, :])  # always greedy: no exploration
        observation, reward, terminated, truncated, info = env.step(action)
        if terminated or truncated:
            break  # episode finished; stop instead of starting a new one
finally:
    env.close()  # close the render window even if a step raises