In [3]:
from pyvirtualdisplay import Display
import gym

Purpose of the below code:

If you are using a headless server (i.e., without a screen), such as a
virtual machine on the cloud, rendering will fail. The only way to
avoid this is to use a fake X server such as Xvfb or Xdummy. For
example, you can install Xvfb (apt install xvfb on Ubuntu or
Debian) and start Python using the following command: xvfb-run
-s "-screen 0 1400x900x24" python3. Alternatively, install Xvfb
and the pyvirtualdisplay library (which wraps Xvfb) and run
pyvirtualdisplay.Display(visible=0, size=(1400,
900)).start() at the beginning of your program.

In [4]:
# Start the virtual display (invisible, but it will be used for rendering)
display = Display(visible=0, size=(1400, 900))  # invisible to you
display.start()

<pyvirtualdisplay.display.Display at 0x115a0a920>

In [5]:
env = gym.make('CartPole-v1', render_mode='rgb_array')

In [6]:
obs = env.reset() # Initialize the environment and get the first observation
obs

(array([-0.03369221,  0.02581028, -0.04109971, -0.02385363], dtype=float32),
 {})

In [20]:
type(obs)  # Check the type of the observation

tuple

These floats represent the cart’s horizontal position (0.0 = center), its velocity (positive means right), 
the angle of the pole (0.0 = vertical), and its angular velocity (positive means clockwise).

In [7]:
gym.envs.registry.keys() # list all available environments

dict_keys(['CartPole-v0', 'CartPole-v1', 'MountainCar-v0', 'MountainCarContinuous-v0', 'Pendulum-v1', 'Acrobot-v1', 'LunarLander-v2', 'LunarLanderContinuous-v2', 'BipedalWalker-v3', 'BipedalWalkerHardcore-v3', 'CarRacing-v2', 'Blackjack-v1', 'FrozenLake-v1', 'FrozenLake8x8-v1', 'CliffWalking-v0', 'Taxi-v3', 'Reacher-v2', 'Reacher-v4', 'Pusher-v2', 'Pusher-v4', 'InvertedPendulum-v2', 'InvertedPendulum-v4', 'InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v4', 'HalfCheetah-v2', 'HalfCheetah-v3', 'HalfCheetah-v4', 'Hopper-v2', 'Hopper-v3', 'Hopper-v4', 'Swimmer-v2', 'Swimmer-v3', 'Swimmer-v4', 'Walker2d-v2', 'Walker2d-v3', 'Walker2d-v4', 'Ant-v2', 'Ant-v3', 'Ant-v4', 'Humanoid-v2', 'Humanoid-v3', 'Humanoid-v4', 'HumanoidStandup-v2', 'HumanoidStandup-v4'])

In [8]:
img = env.render()
img.shape

(400, 600, 3)

In [9]:
env.action_space # Lists the posible actions 

Discrete(2)

Discrete(2) means that the possible actions are integers 0 and 1, which represent
accelerating left (0) or right (1).

In [10]:
action = 1 # Leaning towards the right

In [14]:
next_state_raw, reward, done, truncated, _ = env.step(action)

In [15]:
next_state_raw

array([-0.02874607,  0.41718522, -0.0481611 , -0.6347144 ], dtype=float32)

In [16]:
reward

1.0

In [17]:
done

False

### A simple policy run

Let’s hardcode a simple policy that accelerates left when the pole is leaning toward the
left and accelerates right when the pole is leaning toward the right.

Thes 4 obs represent the cart’s horizontal position (0.0 = center), its velocity (positive means right), 
the angle of the pole (0.0 = vertical), and its angular velocity (positive means clockwise).

In [18]:
def basic_policy(obs):
    angle = obs[2]
    return 0 if angle < 0 else 1

In [25]:
obs

array([ 0.02533285, -0.21524055, -0.04011324,  0.29033074], dtype=float32)

In [28]:
totals = []
for episode in range(500):
    episode_rewards = 0
    obs, info = env.reset()
    print(obs)
    for step in range(200):
        action = basic_policy(obs)
        obs, reward, done, truncated, info = env.step(action)
        episode_rewards += reward
        if done:
            break
    totals.append(episode_rewards)

[-0.01196127 -0.02036527  0.02278646  0.00482107]
[ 0.0496597  -0.03018816 -0.0318079   0.03115999]
[-0.01516894 -0.02157664  0.03288854 -0.02986083]
[ 0.02131459  0.03253981 -0.01978048 -0.00207757]
[ 0.03565449 -0.00705374 -0.04852698  0.01210338]
[-0.03340174 -0.0155145  -0.01529141 -0.03471439]
[ 0.03986988 -0.00522272  0.0187891   0.0459144 ]
[-0.03891626 -0.04268832  0.0367378  -0.00108203]
[ 0.00343346 -0.01021547 -0.00467798  0.0109698 ]
[-0.03675235 -0.01617947  0.02659632  0.02010459]
[ 0.01746624  0.024594   -0.02196151  0.03497582]
[-0.00398488  0.01432834  0.0138099  -0.03185057]
[-2.4373425e-02  5.0963240e-04  1.2593530e-02 -4.0183095e-05]
[ 0.00548184 -0.00487949 -0.02579877  0.01395575]
[ 0.03198279 -0.01414612  0.02359838 -0.04905053]
[0.04072368 0.01629608 0.01596993 0.04371798]
[-0.04186226  0.04373675 -0.04588257  0.01030541]
[ 0.04872276 -0.02848777 -0.0063151   0.0290304 ]
[ 0.00363589  0.02662113  0.03644596 -0.0173772 ]
[ 0.03114117 -0.00324332  0.00637732  0.02

In [29]:
import numpy as np
np.mean(totals), np.std(totals), np.min(totals), np.max(totals)

(42.596, 9.34862471168888, 24.0, 70.0)

### Using Neural Network to define a policy

In [30]:
import tensorflow as tf
from tensorflow import keras

In [34]:
n_inputs = env.observation_space.shape[0]
n_inputs

4

In [35]:
model = keras.models.Sequential([
    keras.layers.Dense(5, activation="elu", input_shape=[n_inputs]),
    keras.layers.Dense(1, activation="sigmoid"),
])

## Policy Gradients

In [70]:
def play_one_step(env, obs, model, loss_fn):
    with tf.GradientTape() as tape:
        left_proba = model(obs[np.newaxis])
        action = (tf.random.uniform([1, 1]) > left_proba)
        y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32)
        loss = tf.reduce_mean(loss_fn(y_target, left_proba))
        
    grads = tape.gradient(loss, model.trainable_variables)
    obs, reward, done,truncated, info = env.step(int(action[0, 0].numpy()))
    return obs, reward, done, grads

In [72]:
def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    all_rewards = []
    all_grads = []
    for episode in range(n_episodes):
        current_rewards = []
        current_grads = []
        obs,info = env.reset()
        for step in range(n_max_steps):
            obs, reward, done, grads = play_one_step(env, obs, model, loss_fn)
            current_rewards.append(reward)
            current_grads.append(grads)
            if done:
                break
        all_rewards.append(current_rewards)
        all_grads.append(current_grads)
    return all_rewards, all_grads

In [64]:
def discount_rewards(rewards, discount_factor):
    discounted = np.array(rewards)
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_factor
    return discounted

In [65]:
def discount_and_normalize_rewards(all_rewards, discount_factor):
    all_discounted_rewards = [discount_rewards(rewards, discount_factor) for rewards in all_rewards]
    print(all_discounted_rewards)
    flat_rewards = np.concatenate(all_discounted_rewards)
    print(flat_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(discounted_rewards - reward_mean) / reward_std for discounted_rewards in all_discounted_rewards]

In [66]:
#discount_rewards([10, 0, -50], discount_factor=0.8)
discount_and_normalize_rewards([[10, 0, -50], [10, 20]], discount_factor=0.8)

[array([-22, -40, -50]), array([26, 20])]
[-22 -40 -50  26  20]


[array([-0.28435071, -0.86597718, -1.18910299]),
 array([1.26665318, 1.0727777 ])]

In [67]:
n_iterations = 150
n_episodes_per_update = 10
n_max_steps = 200
discount_factor = 0.95

In [68]:
optimizer = keras.optimizers.legacy.Adam(learning_rate=0.01)
loss_fn = keras.losses.binary_crossentropy

In [73]:
for iteration in range(n_iterations):
    all_rewards, all_grads = play_multiple_episodes(env, n_episodes_per_update, n_max_steps, model, loss_fn)
    all_final_rewards = discount_and_normalize_rewards(all_rewards, discount_factor)

    all_mean_grads = []

    for var_index in range(len(model.trainable_variables)):
        mean_grads = tf.reduce_mean([final_reward * all_grads[episode_index][step][var_index] for episode_index, final_rewards in enumerate(all_final_rewards)
            for step, final_reward in enumerate(final_rewards)], axis=0)
        all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

[array([13.85286265, 13.5293291 , 13.18876747, 12.83028155, 12.45292795,
       12.05571363, 11.6375933 , 11.19746663, 10.7341754 , 10.24650042,
        9.73315833,  9.19279825,  8.62399815,  8.02526122,  7.39501181,
        6.73159137,  6.03325408,  5.29816219,  4.52438125,  3.709875  ,
        2.8525    ,  1.95      ,  1.        ]), array([9.73315833, 9.19279825, 8.62399815, 8.02526122, 7.39501181,
       6.73159137, 6.03325408, 5.29816219, 4.52438125, 3.709875  ,
       2.8525    , 1.95      , 1.        ]), array([11.6375933 , 11.19746663, 10.7341754 , 10.24650042,  9.73315833,
        9.19279825,  8.62399815,  8.02526122,  7.39501181,  6.73159137,
        6.03325408,  5.29816219,  4.52438125,  3.709875  ,  2.8525    ,
        1.95      ,  1.        ]), array([10.24650042,  9.73315833,  9.19279825,  8.62399815,  8.02526122,
        7.39501181,  6.73159137,  6.03325408,  5.29816219,  4.52438125,
        3.709875  ,  2.8525    ,  1.95      ,  1.        ]), array([11.6375933 , 11.19746