In [None]:
#!git clone git@github.com:micheltokic/crawlingrobot.git
#!pip install -e crawlingrobot/

In [1]:
import sys
import numpy as np
import pygame
import os
#os.environ['SDL_VIDEODRIVER']='dummy'
import gym
import gym_crawlingrobot

pygame 2.1.2 (SDL 2.0.18, Python 3.7.9)
Hello from the pygame community. https://www.pygame.org/contribute.html


# 1) Manual

In [2]:
def manual_robot_control (env):
    
    done = False
    action = None
    env.reset()
    cum_reward = 0
    step = 0

    while True:
        # process pygame event loop
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                return
            elif event.type == pygame.KEYDOWN:
                if event.key == pygame.K_ESCAPE:
                    pygame.quit()
                    return
                elif event.key == pygame.K_UP or event.key == pygame.K_w:
                    env.step(0)
                    action = 0
                elif event.key == pygame.K_RIGHT or event.key == pygame.K_d:
                    action = 1
                elif event.key == pygame.K_DOWN or event.key == pygame.K_s:
                    action = 2
                elif event.key == pygame.K_LEFT or event.key == pygame.K_a:
                    action = 3
                elif event.key == pygame.K_r:
                    env.reset()
                    action = 3
                elif event.key == pygame.K_SPACE:
                    env.robot.render_intermediate_steps = not env.robot.render_intermediate_steps

                if action:
                    obs, reward, done, info = env.step(action)
                    action = None
                    cum_reward += reward
                    step += 1
                    print (f"step={step}, action={action}, reward={reward:.2f}, cum_reward={cum_reward:.2f}, done={done}")
                if done:
                    env.reset()
                    action = 3
                    cum_reward = 0
                    step = 0

            env.render()


In [3]:
env = gym.make('crawlingrobot-discrete-v1', rotation_angles=5, goal_distance=700, window_size=(640, 480), render_intermediate_steps=True, plot_steps_per_episode=True)
env.robot.mode = 2 # => Use WASD or Arrow Keys to control the robot's arms
manual_robot_control (env)
pygame.quit()

step=1, action=None, reward=0.99, cum_reward=0.99, done=False
step=2, action=None, reward=0.01, cum_reward=1.00, done=False
step=3, action=None, reward=7.24, cum_reward=8.24, done=False
step=4, action=None, reward=13.38, cum_reward=21.63, done=False
step=5, action=None, reward=22.67, cum_reward=44.29, done=False
step=6, action=None, reward=23.59, cum_reward=67.89, done=False
step=7, action=None, reward=18.93, cum_reward=86.81, done=False
step=8, action=None, reward=6.89, cum_reward=93.70, done=False
step=9, action=None, reward=-0.03, cum_reward=93.67, done=False
step=10, action=None, reward=0.02, cum_reward=93.69, done=False
step=11, action=None, reward=0.01, cum_reward=93.70, done=False
step=12, action=None, reward=0.01, cum_reward=93.71, done=False
step=13, action=None, reward=0.00, cum_reward=93.71, done=False
step=14, action=None, reward=-0.02, cum_reward=93.69, done=False
step=15, action=None, reward=0.01, cum_reward=93.70, done=False
step=16, action=None, reward=-0.01, cum_reward

step=129, action=None, reward=0.01, cum_reward=472.77, done=False
step=130, action=None, reward=0.00, cum_reward=472.77, done=False
step=131, action=None, reward=6.81, cum_reward=479.58, done=False
step=132, action=None, reward=14.24, cum_reward=493.83, done=False
step=133, action=None, reward=0.00, cum_reward=493.83, done=False
step=134, action=None, reward=0.00, cum_reward=493.83, done=False
step=135, action=None, reward=0.00, cum_reward=493.83, done=False
step=136, action=None, reward=22.75, cum_reward=516.58, done=False
step=137, action=None, reward=23.18, cum_reward=539.76, done=False
step=138, action=None, reward=17.48, cum_reward=557.24, done=True
step=1, action=None, reward=0.97, cum_reward=0.97, done=False
step=2, action=None, reward=0.01, cum_reward=0.98, done=False
step=3, action=None, reward=-0.01, cum_reward=0.97, done=False
step=4, action=None, reward=0.01, cum_reward=0.97, done=False
step=5, action=None, reward=-0.02, cum_reward=0.95, done=False


# 2) Q-Learning with discrete actions

In [4]:
# function maps the 2D observation (x, y) to a single state number n 
def obs_to_number(obs, obs_max):
    return int(obs[0] * obs_max + obs[1])

def q_agent(Q, obs_max, env, learn=True, render=False, alpha=1, gamma=0.95, epsilon=0.2, maxSteps=10000, episodes=200):
    
    print (f"Q.shape={Q.shape}")
    np.set_printoptions(threshold=sys.maxsize)

    for episode in range (episodes):
        done = False
        init_obs = env.reset()
        state = obs_to_number(init_obs, obs_max)
        step = 0
        cum_reward =0 

        while not done and step < maxSteps:
            
                # process pygame event loop
                for event in pygame.event.get():
                    if event.type == pygame.QUIT:
                        pygame.quit()
                        return
                    elif event.type == pygame.KEYDOWN:
                        if event.key == pygame.K_ESCAPE:
                            pygame.quit()
                            return
                        if event.key == pygame.K_SPACE:
                            env.robot.render_intermediate_steps = not env.robot.render_intermediate_steps

                # action selection
                if np.random.rand() < epsilon:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(Q[state])

                # perform action in environment
                nextObs, reward, done, _ = env.step(action)
                cum_reward += reward
                
                if render:
                    env.render()

                nextState = obs_to_number(nextObs, obs_max)

                # Q-learning
                if learn:
                    Q[state, action] += alpha * (reward + gamma * np.max(Q[nextState]) - Q[state, action])

                # time transition
                state = nextState
                step += 1
        res = 0
        if len(env.robot.episode_time_results) > 0:
            res = env.robot.episode_time_results[-1]
        print(f"episode={episode} took {step} steps => cumulative reward: {cum_reward:.2f}")
        
    pygame.quit()
    return

#########################
# Initialize Q function
#########################
obs_max = env.observation_space.high[0] + 1  # currently 5
Q = np.zeros([obs_max ** len(env.observation_space.high), env.action_space.n])

## 2.1) Learn Q function

In [5]:
pygame.quit()

# instantiate environment
env = gym.make('crawlingrobot-discrete-v1', rotation_angles=5, goal_distance=700)

# learn Q function
q_agent(Q=Q, obs_max=obs_max, env=env, gamma=0.9, epsilon=0.1, episodes=100, render=False, learn=True)

Q.shape=(25, 4)
episode=0 took 1028 steps => cumulative reward: 517.27
episode=1 took 156 steps => cumulative reward: 512.80
episode=2 took 167 steps => cumulative reward: 510.91
episode=3 took 226 steps => cumulative reward: 510.32
episode=4 took 149 steps => cumulative reward: 510.94
episode=5 took 163 steps => cumulative reward: 515.71
episode=6 took 121 steps => cumulative reward: 510.71
episode=7 took 117 steps => cumulative reward: 522.51
episode=8 took 118 steps => cumulative reward: 514.04
episode=9 took 106 steps => cumulative reward: 523.96
episode=10 took 108 steps => cumulative reward: 522.98
episode=11 took 98 steps => cumulative reward: 512.74
episode=12 took 116 steps => cumulative reward: 513.77
episode=13 took 111 steps => cumulative reward: 519.10
episode=14 took 123 steps => cumulative reward: 527.61
episode=15 took 105 steps => cumulative reward: 520.53
episode=16 took 99 steps => cumulative reward: 514.06
episode=17 took 90 steps => cumulative reward: 511.06
episod

## 2.2) Evaluate policy derived from Q function

In [6]:
# evalue Q function
pygame.quit()
env = gym.make('crawlingrobot-discrete-v1', rotation_angles=5, goal_distance=700, window_size=(640, 480), plot_steps_per_episode=True)
q_agent(Q=Q, obs_max=obs_max, env=env, episodes=20, epsilon=0, render=True, learn=False)

Q.shape=(25, 4)
episode=0 took 75 steps => cumulative reward: 527.24
episode=1 took 75 steps => cumulative reward: 527.24
episode=2 took 75 steps => cumulative reward: 527.24


# 3) PPO2 control with continuous actions

In [None]:
#!pip install stable-baselines
#!pip install tensorflow==1.15

In [None]:
import pygame
import sys
import os
import numpy as np
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines.bench import Monitor
from stable_baselines import PPO2
import gym
import gym_crawlingrobot

In [None]:
# define callback class for event loop cleanup
from stable_baselines.common.callbacks import BaseCallback

class PyGameEventLoopCallback(BaseCallback):
    
    render = False
    
    def __init__(self, verbose=0, render=False):
        super(PyGameEventLoopCallback, self).__init__(verbose)
        self.render = render
        # Those variables will be accessible in the callback
        # (they are defined in the base class)
        # The RL model
        # self.model = None  # type: BaseRLModel
        # An alias for self.model.get_env(), the environment used for training
        # self.training_env = None  # type: Union[gym.Env, VecEnv, None]
        # Number of time the callback was called
        # self.n_calls = 0  # type: int
        # self.num_timesteps = 0  # type: int
        # local and global variables
        # self.locals = None  # type: Dict[str, Any]
        # self.globals = None  # type: Dict[str, Any]
        # The logger object, used to report things in the terminal
        # self.logger = None  # type: logger.Logger
        # # Sometimes, for event callback, it is useful
        # # to have access to the parent object
        # self.parent = None  # type: Optional[BaseCallback]
    
    def _on_step(self) -> bool:
        
        robot_env = self.training_env.venv.envs[0]
        
        # process pygame event loop
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                return
            elif event.type == pygame.KEYDOWN:
                if event.key == pygame.K_ESCAPE:
                    pygame.quit()
                    return
                if event.key == pygame.K_SPACE:
                    robot_env.robot.render_intermediate_steps = not robot_env.robot.render_intermediate_steps

        if self.render:
            robot_env.render()
        
        return bool

In [None]:
log_dir = ""
ppo2 = "ppo2"
os.makedirs(ppo2, exist_ok=True)

def ppo2_learn(env, render=False):
    env = VecNormalize(DummyVecEnv([lambda: Monitor(env, log_dir)]), norm_obs=True, norm_reward=True)
    model = PPO2(MlpPolicy, env, verbose=1, learning_rate=0.15)
    model.learn(total_timesteps=30000, callback=PyGameEventLoopCallback(render=render))
    
    #model.learn(total_timesteps=30000)
    model.save("ppo2/ppo2_crawling_robot")
    env.save("ppo2/vec_normalize.pkl")
 
    del model, env


def ppo2_run_policy(env, render=False, episodes=1):
    env = DummyVecEnv([lambda: Monitor(env, log_dir)])
    env = VecNormalize.load("ppo2/vec_normalize.pkl", env)
    model = PPO2.load("ppo2/ppo2_crawling_robot")

    # visualization callback
    cb = PyGameEventLoopCallback(render=render)
    cb.training_env = env

    env.training = False
    
    for e in range (episodes): 

        obs = env.reset()
        done = False
        cum_reward = 0
        step = 0

        while not done:
            action, _states = model.predict(obs)
            obs, _reward, done, info = env.step(action)
            reward = env.get_original_reward() # returns the last unnormalized reward
            cb._on_step()
            cum_reward += reward[0]
            step += 1
            print (f"episode={e}, step={step}, action={action}, reward={reward[0]:.2f}, cum_reward={cum_reward:.2f}, done={done}")


### Train policy for 30000 timesteps (no GUI)

In [None]:
robot_env = gym.make('crawlingrobot-continuous-v1', goal_distance=2500)
ppo2_learn(env=robot_env, render=False)
pygame.quit()

### Evaluate policy (with GUI)

In [None]:
#robot_env_nogui = gym.make('crawlingrobot-continuous-v1', goal_distance=2500, plot_steps_per_episode=False, render_intermediate_steps=False)
robot_env_gui = gym.make('crawlingrobot-continuous-v1', goal_distance=700, window_size=(640, 480), plot_steps_per_episode=True, render_intermediate_steps=True)
ppo2_run_policy(env=robot_env_gui, episodes=30, render=True)
pygame.quit()