# Simple Walk 2D continuous

base environment on https://github.com/nicknochnack/ReinforcementLearningCourse/blob/main/Project%203%20-%20Custom%20Environment.ipynb

continuous state space environment

action space: BOX in two directions from -1 to 1

state space: box with shape(2,1)?

goal reached when distance closer than 1





## 1. Import Dependencies

In [2]:
import gym 
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

In [7]:
test_box = Box(low=0.0, high=10.0, shape=(2,2))
print(test_box)
print(type(test_box))
print(test_box.sample())
print(type(test_box.sample()))
print(test_box.sample().shape)

Box([[0. 0.]
 [0. 0.]], [[10. 10.]
 [10. 10.]], (2, 2), float32)
<class 'gym.spaces.box.Box'>
[[9.209576 8.657924]
 [9.450312 8.482261]]
<class 'numpy.ndarray'>
(2, 2)


In [16]:
state = np.ndarray(shape=(2,2), dtype=np.float32)
print(state)
print(type(state))
state = np.random.uniform(0.0, 10.0, 2)
print(state)
print(type(state))

[[3.822255   0.28385028]
 [6.005751   6.436809  ]]
<class 'numpy.ndarray'>
[3.48290359 3.23671321]
<class 'numpy.ndarray'>


In [29]:
state = np.random.uniform(0.0, 10.0, (2,2))
print(state)
print(state[0])
distance = np.linalg.norm(state[0] - state[1])
print(distance)

[[1.62057846 5.51020763]
 [1.08893779 7.11932847]]
[1.62057846 5.51020763]
1.6946715466992928


In [26]:
def out_of_bounds(state):
    return not (0.0 <= state[0] <= 10.0 and 0.0 <= state[1] <= 10.0)

position = np.random.uniform(-10.0, 10.0, (2,))
print(position)
print(out_of_bounds(position))


[0.11871922 0.05648993]
False


In [31]:
class SimpleWalk2Dcont(Env):
    """simple walk environment in 1D with a continuous action and state space"""
    def __init__(self):
        self.action_space = Box(low=-1.0, high=1.0, shape=(2, ))
        self.observation_space = Box(low=0.0, high=10.0, shape=(2, 2)) # 0 position, 1 goal
        self.state = np.ndarray(shape=(2,2), dtype=np.float32)
        
    
    def step(self, action):
        # update position
        self.state[0] += action
        
        
        position = self.state[0]
        goal = self.state[1]
        distance_to_goal = np.linalg.norm(state[0] - state[1])
        
        if out_of_bounds(position):
            # went out of bounds
            reward = -10.0
            done = True
        elif distance_to_goal < 1:
            # reached goal
            reward = 10.0
            done = True
        else:
            # stepping is penalized
            reward = -0.1
            done = False
        return self.state, reward, done, {}
    
    def reset(self):
        self.state = np.random.uniform(0.0, 10.0, (2,2))
        return self.state
    
    def render(self):
        pass
    
    def __out_of_bounds(state):
        return not (0.0 <= state[0] <= 10.0 and 0.0 <= state[1] <= 10.0)
    
env = SimpleWalk2Dcont()

print(env.action_space.sample())
print(env.observation_space.sample())

[-0.46764672 -0.6698517 ]
[[5.969993  6.1932683]
 [7.053923  9.32972  ]]


In [32]:
env = SimpleWalk2Dcont()
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:-11.3
Episode:2 Score:-12.3
Episode:3 Score:6.399999999999999
Episode:4 Score:-16.39999999999999
Episode:5 Score:-10.0


In [33]:
log_path = os.path.join('Training', 'Logs')

In [34]:
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [35]:
model.learn(total_timesteps=4000)

Logging to Training\Logs\PPO_9
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 15.8     |
|    ep_rew_mean     | -11.5    |
| time/              |          |
|    fps             | 436      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 26.6        |
|    ep_rew_mean          | -12.6       |
| time/                   |             |
|    fps                  | 438         |
|    iterations           | 2           |
|    time_elapsed         | 9           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.018371014 |
|    clip_fraction        | 0.194       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.8        |
|    explained_variance   | -0.0165     |

<stable_baselines3.ppo.ppo.PPO at 0x14f2fa38580>

In [None]:
model.save('PPO')

In [None]:
evaluate_policy(model, env, n_eval_episodes=10, render=False)

(97.0, 2.6076809620810595)