## Train Model

In [21]:
import os
import logging
from datetime import datetime
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback

# getLogger() with no name returns the root logger; INFO hides DEBUG messages
# during training (the test section later switches this to DEBUG).
logger = logging.getLogger()
logger.setLevel(logging.INFO)

from SimpleWalk2D import SimpleWalk2DDynGoal

# Environment instance used by the cells below until it is re-created.
env = SimpleWalk2DDynGoal()

In [22]:
# Record when this run started, formatted as YYYY-MM-DD_HHMM.
print('Date and time:', datetime.now().strftime('%Y-%m-%d_%H%M'))

Date and time: 2022-02-24_0910


### Train Callback

In [23]:
class TrainAndLoggingCallback(BaseCallback):
    """Save a checkpoint of the model every ``check_freq`` callback calls.

    Checkpoints are written to ``save_path`` as ``best_model_<n_calls>``.
    Despite the file name, no comparison of model quality is made: every
    checkpoint is a plain periodic snapshot.

    Args:
        check_freq: Number of ``_on_step`` calls between two checkpoints.
        save_path: Directory for the checkpoints. ``None`` disables saving.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save the model on every ``check_freq``-th call; always return True.

        ``self.n_calls`` and ``self.model`` are not set in this class; they are
        presumably maintained by ``BaseCallback`` -- confirm against the SB3 docs.
        """
        # Mirror the None guard of _init_callback: without it os.path.join
        # would raise TypeError on the first checkpoint when save_path is None.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

### Optuna


In [24]:
# Import the optimization framework used for hyperparameter optimization (HPO)
import optuna
# PPO algo for RL
from stable_baselines3 import PPO
# Bring in the eval policy method for metric calculation
from stable_baselines3.common.evaluation import evaluate_policy
# Import the sb3 monitor for logging 
from stable_baselines3.common.monitor import Monitor
# Import the vec wrappers to vectorize and frame stack
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
# Import os to deal with filepaths
import os


# Function to return test hyperparameters - define the objective function

# Take the timestamp once so both directories share the same run identifier,
# even if the minute rolls over between the two assignments.
RUN_TIMESTAMP = datetime.now().strftime('%Y-%m-%d_%H%M')

# TensorBoard logs (and, later, training checkpoints) for this run.
LOG_DIR = './train/logs/' + RUN_TIMESTAMP + '/'
# Per-trial models and the pickled Optuna study for this run.
OPT_DIR = './train/opt/' + RUN_TIMESTAMP + '/'

def optimize_ppo(trial):
    """Sample one set of PPO hyperparameters from the search space.

    Args:
        trial: Optuna trial object used to draw the values.

    Returns:
        dict: Keyword arguments to pass to the ``PPO`` constructor, with keys
        ``n_steps``, ``gamma``, ``learning_rate``, ``clip_range`` and
        ``gae_lambda``.
    """
    return {
        # Integer in [256, 8192] restricted to steps of 64.
        'n_steps': trial.suggest_int('n_steps', 256, 8192, step=64),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-4, log=True),
        # Plain suggest_float replaces the deprecated suggest_uniform.
        'clip_range': trial.suggest_float('clip_range', 0.1, 0.4),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, 0.99),
    }
    
# Module-level save path hard-coded to trial number 1. optimize_agent builds its
# own per-trial path from trial.number rather than reading this constant.
SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(1))

# Run a training loop and return mean reward
def optimize_agent(trial):
    """Optuna objective: train PPO with sampled hyperparameters and score it.

    A fresh ``SimpleWalk2DDynGoal`` environment is created for the trial, a PPO
    model is trained on it for 50,000 timesteps and then evaluated over 20
    episodes. The trained model is saved under ``OPT_DIR`` as
    ``trial_<trial.number>_best_model``.

    Args:
        trial: Optuna trial object supplying the hyperparameters and the
            trial number.

    Returns:
        Mean episode reward reported by ``evaluate_policy``.
    """
    # Get hyperparameters
    model_params = optimize_ppo(trial)

    # Create environment
    env = SimpleWalk2DDynGoal()
    try:
        # Create algo
        model = PPO(
            'MlpPolicy',
            env,
            tensorboard_log=LOG_DIR,
            verbose=0,
            **model_params)
        model.learn(total_timesteps=50_000)

        # Evaluate model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=20)
    finally:
        # Release the environment even when training or evaluation raises.
        env.close()

    # Local name chosen so the module-level SAVE_PATH constant is not shadowed.
    trial_model_path = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
    model.save(trial_model_path)

    return mean_reward

    
# Creating the experiment: the study maximises the value returned by optimize_agent.
study = optuna.create_study(direction='maximize')

# Search budget: n_trials=1000 and timeout=3600 seconds (one hour).
study.optimize(optimize_agent, n_trials=1000, timeout=3600, show_progress_bar=True)

# Only the last expression of a cell is echoed, so this line displays nothing.
study.best_params

study.best_trial


[32m[I 2022-02-24 09:10:25,495][0m A new study created in memory with name: no-name-aa85e5e3-acee-4a02-a802-3e381f4b4cb3[0m
  self._init_valid()
  0%|          | 1/1000 [01:12<20:11:05, 72.74s/it, 72.74/3600 seconds]

[32m[I 2022-02-24 09:11:38,237][0m Trial 0 finished with value: -1920.7687418937683 and parameters: {'n_steps': 5696, 'gamma': 0.9582611351983121, 'learning_rate': 3.318644866318253e-05, 'clip_range': 0.15986993397716537, 'gae_lambda': 0.8863496158176362}. Best is trial 0 with value: -1920.7687418937683.[0m


  0%|          | 2/1000 [02:25<20:12:59, 72.93s/it, 145.80/3600 seconds]

[32m[I 2022-02-24 09:12:51,293][0m Trial 1 finished with value: 946.3818915713579 and parameters: {'n_steps': 1152, 'gamma': 0.8281372162716393, 'learning_rate': 5.947225351527409e-05, 'clip_range': 0.34107143500589554, 'gae_lambda': 0.8940888101428192}. Best is trial 1 with value: 946.3818915713579.[0m


  0%|          | 3/1000 [03:40<20:25:34, 73.76s/it, 220.54/3600 seconds]

[32m[I 2022-02-24 09:14:06,037][0m Trial 2 finished with value: -1896.361507844925 and parameters: {'n_steps': 5248, 'gamma': 0.9246512695420032, 'learning_rate': 2.7403035783197963e-05, 'clip_range': 0.1491968491508225, 'gae_lambda': 0.9502051496910219}. Best is trial 1 with value: 946.3818915713579.[0m


In [None]:
# Display the hyperparameters of the best trial in the study.
study.best_params


{'n_steps': 4096,
 'gamma': 0.814916887081507,
 'learning_rate': 9.988928342451913e-05,
 'clip_range': 0.2805024112319538,
 'gae_lambda': 0.8322603084029443}

In [None]:
# Display the number of the best trial; it is used below to pick the saved model file.
study.best_trial.number

486

In [None]:
import joblib

# Persist the whole study so results can be inspected without re-running the search.
# The directory is created first, so the dump does not depend on a trial having
# already written a model there.
os.makedirs(OPT_DIR, exist_ok=True)
joblib.dump(study, os.path.join(OPT_DIR, 'study.pkl'))

['./train/opt/study.pkl']

In [None]:
# File name of the model saved for the best trial of the study.
best_model_file = 'trial_{}_best_model.zip'.format(study.best_trial.number)
load_path = os.path.join(OPT_DIR, best_model_file)

# Fresh environment; its observation space is printed before it is wrapped in Monitor.
env = SimpleWalk2DDynGoal()
print(env.observation_space)
env = Monitor(env)

model = PPO.load(path=load_path, env=env)

# NOTE(review): compare these with the environment's space printed above -- the
# loaded model can only be used on observations of the shape it reports here.
print(model.action_space)
print(model.observation_space)


Box([0. 0. 0. 0. 0. 0.], [20. 20. 20. 20. 20. 20.], (6,), float32)
Box([-1. -1.], [1. 1.], (2,), float32)
Box([0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], [20. 20. 20. 20. 20. 20. 20. 20. 20. 20. 20. 20. 20. 20. 20. 20. 20. 20.
 20. 20. 20. 20. 20. 20.], (24,), float32)


In [None]:
# Echo the loaded model object to confirm that `model` is bound.
model

<stable_baselines3.ppo.ppo.PPO at 0x7f7dfa3fce50>

In [None]:
# Continue training the loaded model, writing a checkpoint into LOG_DIR
# every 10,000 callback calls.
checkpoint_callback = TrainAndLoggingCallback(check_freq=10_000, save_path=LOG_DIR)
model.learn(total_timesteps=300_000, callback=checkpoint_callback)

<stable_baselines3.ppo.ppo.PPO at 0x7f7dfa36c4c0>

In [None]:
# Disabled code: the whole cell is a single string literal, so nothing in it is
# executed. Because the string is the cell's last expression, the notebook
# echoes it as the cell output. It holds an alternative training run that builds
# a PPO model with default hyperparameters instead of loading a tuned one.
"""
env_name = 'SW2DDynGoal'

CHECKPOINT_DIR = './train/train_' + env_name
LOG_DIR = './train/log_' + env_name

callback = TrainAndLoggingCallback(check_freq=10_000, save_path=CHECKPOINT_DIR)

log_path = os.path.join('Training', 'Logs')

model = PPO(
    "MlpPolicy", 
    env, 
    verbose=0, 
    tensorboard_log=log_path,
    #learning_rate=0.0001,
    #n_steps =2048
    )
logger.setLevel(logging.INFO)

model.learn(
    total_timesteps=300_000, 
    callback = callback
    )

model.save('PPO')

logger.setLevel(logging.DEBUG)
"""

'\nenv_name = \'SW2DDynGoal\'\n\nCHECKPOINT_DIR = \'./train/train_\' + env_name\nLOG_DIR = \'./train/log_\' + env_name\n\ncallback = TrainAndLoggingCallback(check_freq=10_000, save_path=CHECKPOINT_DIR)\n\nlog_path = os.path.join(\'Training\', \'Logs\')\n\nmodel = PPO(\n    "MlpPolicy", \n    env, \n    verbose=0, \n    tensorboard_log=log_path,\n    #learning_rate=0.0001,\n    #n_steps =2048\n    )\nlogger.setLevel(logging.INFO)\n\nmodel.learn(\n    total_timesteps=300_000, \n    callback = callback\n    )\n\nmodel.save(\'PPO\')\n\nlogger.setLevel(logging.DEBUG)\n'

## Test Model

In [None]:
from stable_baselines3 import PPO
from SimpleWalk2D import SimpleWalk2DDynGoal

# Rebind `env` to a fresh environment that is not wrapped in Monitor; the cells
# below use it together with the `model` defined in the training section.
env = SimpleWalk2DDynGoal()


In [None]:

# test prediction
# DEBUG makes the root logger emit debug messages while the episodes run.
logger.setLevel(logging.DEBUG)

episodes = 10
for episode in range(episodes):
    # Feed the policy the observation returned by the environment API rather
    # than the internal env.state attribute. Assumes reset() returns the
    # observation, matching the 4-tuple step() below -- TODO confirm.
    # NOTE(review): the recorded run failed with an observation-shape mismatch
    # (6 vs 24). Using `obs` does not change the shape the loaded policy
    # expects -- confirm the checkpoint was trained on this env configuration.
    obs = env.reset()
    done = False

    while not done:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)

    print('done')
    print("info", info)
    # Rendered once per episode, after the episode has finished.
    env.render()

DEBUG:root:
DEBUG:root:reset


ValueError: Error: Unexpected observation shape (6,) for Box environment, please use (24,) or (n_env, 24) for the observation shape.

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

# Evaluate the current `model` on `env` over 10 episodes without rendering;
# the call is the cell's last expression, so its return value is echoed.
evaluate_policy(model, env, n_eval_episodes=10, render=False)

# TODO render doesn't work