# Simple Walk 1D

base environment on https://github.com/nicknochnack/ReinforcementLearningCourse/blob/main/Project%203%20-%20Custom%20Environment.ipynb

continuous state space environment

action space: BOX from -1 to 1

state space: tuple of 2 boxes for goal and position from 0 to 10

goal reached when distance closer than 1





## 1. Import Dependencies

In [14]:
import gym 
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

In [15]:
class SimpleWalk1Dcont(Env):
    def __init__(self):
        self.action_space = Box(low=-1.0, high=1.0, shape=(1,))
        self.observation_space = Dict({
            "position": Box(low=0.0, high=10.0, shape=(1,)),
            "goal": Box(low=0.0, high=10.0, shape=(1,))
            })
        self.state = {"position": 0.0, "goal": 10.0}
        
    
    def step(self, action):
        self.state["position"] += action
        
        distance_to_goal = abs(self.state["position"] - self.state["goal"])
        if self.state["position"] < 0.0 or self.state["position"] > 10.0:
            # went out of bounds
            reward = -10.0
            done = True
        elif distance_to_goal < 1:
            # reached goal
            reward = 10.0
            done = True
        else:
            # stepping is penalized
            reward = -0.1
            done = False
        return self.state, reward, done, {}
    
    def reset(self):
        self.state = {"position": random.uniform(0.0, 10.0), "goal": random.uniform(0.0, 10.0)}
    
    def render(self):
        pass
    
    

In [28]:
class SimpleWalk1Dcont(Env):
    def __init__(self):
        self.action_space = Box(low=-1.0, high=1.0, shape=(1,))
        self.observation_space = Box(low=0.0, high=10.0, shape=(2,)) # 0 position, 1 goal
        self.state = (0.0, 0.0)
        
    
    def step(self, action):
        # update position
        self.state[0] += action
        
        position = self.state[1]
        goal = self.state[1]
        distance_to_goal = abs(position - goal)
        if position < 0.0 or position > 10.0:
            # went out of bounds
            reward = -10.0
            done = True
        elif distance_to_goal < 1:
            # reached goal
            reward = 10.0
            done = True
        else:
            # stepping is penalized
            reward = -0.1
            done = False
        return self.state, reward, done, {}
    
    def reset(self):
        self.state = (random.uniform(0.0, 10.0), random.uniform(0.0, 10.0))
    
    def render(self):
        pass
    
env = SimpleWalk1Dcont()

print(env.action_space.sample())
print(env.observation_space.sample())

[-0.31184557]
[8.38541   1.0655336]


In [29]:
env = SimpleWalk1Dcont()
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

TypeError: 'tuple' object does not support item assignment

In [17]:
log_path = os.path.join('Training', 'Logs')

In [18]:
model = PPO("MultiInputPolicy", env, verbose=1, tensorboard_log=log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [21]:
model.learn(total_timesteps=4000)

TypeError: 'NoneType' object is not subscriptable

In [None]:
model.save('PPO')

In [None]:
evaluate_policy(model, env, n_eval_episodes=10, render=False)

(97.0, 2.6076809620810595)