## Train Model

In [1]:
import os
import logging
from datetime import datetime
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
# Import the sb3 monitor for logging 
from stable_baselines3.common.monitor import Monitor
logger = logging.getLogger()
logger.setLevel(logging.INFO)

from SimpleWalk2D import SimpleWalk2DDynGoal

env = SimpleWalk2DDynGoal()
env = Monitor(env)

In [2]:
print('Date and time:', datetime.now().strftime('%Y-%m-%d_%H%M'))

Date and time: 2022-02-24_1550


### Train Callback

In [3]:
class TrainAndLoggingCallback(BaseCallback):
    
    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path
        
    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok = True)
            
    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)
            
        return True

### Optuna


In [4]:
# Importing the optimzation frame - HPO
import optuna
# PPO algo for RL
from stable_baselines3 import PPO
# Bring in the eval policy method for metric calculation
from stable_baselines3.common.evaluation import evaluate_policy

# Import the vec wrappers to vectorize and frame stack
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
# Import os to deal with filepaths
import os


# Function to return test hyperparameters - define the object function

now = datetime.now().strftime('%Y-%m-%d_%H%M')

LOG_DIR = './train/logs/' + now + '/'
OPT_DIR = './train/opt/'+ now + '/'

def optimize_ppo(trial): 
    return {
        'n_steps':trial.suggest_int('n_steps', 256, 8192, 64),
        'gamma':trial.suggest_loguniform('gamma', 0.8, 0.9999),
        'learning_rate':trial.suggest_loguniform('learning_rate', 1e-5, 1e-4),
        'clip_range':trial.suggest_uniform('clip_range', 0.1, 0.4),
        'gae_lambda':trial.suggest_uniform('gae_lambda', 0.8, 0.99)
    }
    
SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(1))

# Run a training loop and return mean reward 
def optimize_agent(trial):
    
    # Get hyperparameters
    model_params = optimize_ppo(trial) 

    # Create environment 
    # env = SimpleWalk2DDynGoal()
    # env = Monitor(env)
    # env = DummyVecEnv([lambda: env])
    # env = VecFrameStack(env, 4, channels_order='last')

    # Create algo 
    model = PPO(
        'MlpPolicy', 
        env, 
        tensorboard_log=LOG_DIR, 
        verbose=0, 
        **model_params)
    model.learn(total_timesteps=10_000)
    #model.learn(total_timesteps=100000)

    # Evaluate model 
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=20)
    env.close()

    SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
    model.save(SAVE_PATH)

    return mean_reward

    
# Creating the experiment 
study = optuna.create_study(direction='maximize')
study.optimize(
    optimize_agent, 
    n_trials=1_000,
    timeout=60*3,
    show_progress_bar=True,
    )
#study.optimize(optimize_agent, n_trials=100, n_jobs=1)

study.best_params

study.best_trial

#model = PPO.load(os.path.join(OPT_DIR, 'trial_5_best_model.zip'))


[32m[I 2022-02-24 15:50:33,775][0m A new study created in memory with name: no-name-32de2f93-4dc7-451e-a8e4-4e7e408cdecb[0m
  self._init_valid()
  0%|          | 1/1000 [00:18<5:02:05, 18.14s/it, 18.14/180 seconds]

[32m[I 2022-02-24 15:50:51,919][0m Trial 0 finished with value: -2569.92988095 and parameters: {'n_steps': 2176, 'gamma': 0.8307698170355562, 'learning_rate': 3.5678144029528294e-05, 'clip_range': 0.33384557675808524, 'gae_lambda': 0.9730418504363965}. Best is trial 0 with value: -2569.92988095.[0m


  0%|          | 2/1000 [00:32<4:29:30, 16.20s/it, 32.99/180 seconds]

[32m[I 2022-02-24 15:51:06,764][0m Trial 1 finished with value: -2018.8031850500004 and parameters: {'n_steps': 3456, 'gamma': 0.8282786624430413, 'learning_rate': 9.88059620047403e-05, 'clip_range': 0.1703635184979676, 'gae_lambda': 0.8502116422706045}. Best is trial 1 with value: -2018.8031850500004.[0m


  0%|          | 3/1000 [00:52<4:54:09, 17.70s/it, 52.48/180 seconds]

[32m[I 2022-02-24 15:51:26,250][0m Trial 2 finished with value: -2481.5854370000006 and parameters: {'n_steps': 4544, 'gamma': 0.9927835890669247, 'learning_rate': 8.322061996517314e-05, 'clip_range': 0.17924213458394134, 'gae_lambda': 0.9606223730142404}. Best is trial 1 with value: -2018.8031850500004.[0m


  0%|          | 4/1000 [01:10<4:54:38, 17.75s/it, 70.30/180 seconds]

[32m[I 2022-02-24 15:51:44,072][0m Trial 3 finished with value: -2359.4829222000003 and parameters: {'n_steps': 4160, 'gamma': 0.9407309500599759, 'learning_rate': 5.734262073833127e-05, 'clip_range': 0.16597556888202913, 'gae_lambda': 0.9184701819176896}. Best is trial 1 with value: -2018.8031850500004.[0m


  0%|          | 5/1000 [01:29<5:02:31, 18.24s/it, 89.41/180 seconds]

[32m[I 2022-02-24 15:52:03,189][0m Trial 4 finished with value: -2843.3989964999996 and parameters: {'n_steps': 6720, 'gamma': 0.9251941342289149, 'learning_rate': 3.133682900204521e-05, 'clip_range': 0.2835033578136893, 'gae_lambda': 0.929397933749119}. Best is trial 1 with value: -2018.8031850500004.[0m


  1%|          | 6/1000 [01:52<5:28:56, 19.86s/it, 112.40/180 seconds]

[32m[I 2022-02-24 15:52:26,175][0m Trial 5 finished with value: -2617.88856385 and parameters: {'n_steps': 8064, 'gamma': 0.8170317826332298, 'learning_rate': 3.914967221066267e-05, 'clip_range': 0.14394579993522252, 'gae_lambda': 0.8357473854649794}. Best is trial 1 with value: -2018.8031850500004.[0m


  1%|          | 7/1000 [02:12<5:29:21, 19.90s/it, 132.39/180 seconds]

[32m[I 2022-02-24 15:52:46,168][0m Trial 6 finished with value: -2621.4537996500003 and parameters: {'n_steps': 7040, 'gamma': 0.9713169440020987, 'learning_rate': 9.38630407481769e-05, 'clip_range': 0.1896197963122627, 'gae_lambda': 0.9278688032227438}. Best is trial 1 with value: -2018.8031850500004.[0m


  1%|          | 8/1000 [02:29<5:13:34, 18.97s/it, 149.36/180 seconds]

[32m[I 2022-02-24 15:53:03,134][0m Trial 7 finished with value: -2735.9867736999995 and parameters: {'n_steps': 2368, 'gamma': 0.8054679521014703, 'learning_rate': 1.244719524238043e-05, 'clip_range': 0.1776816287833368, 'gae_lambda': 0.9603804455962007}. Best is trial 1 with value: -2018.8031850500004.[0m


  1%|          | 9/1000 [02:46<5:05:45, 18.51s/it, 166.87/180 seconds]

[32m[I 2022-02-24 15:53:20,648][0m Trial 8 finished with value: -2414.375610100001 and parameters: {'n_steps': 3072, 'gamma': 0.9411662693961969, 'learning_rate': 1.4755093112635148e-05, 'clip_range': 0.16198377934896532, 'gae_lambda': 0.8384681799964715}. Best is trial 1 with value: -2018.8031850500004.[0m


  1%|          | 10/1000 [03:03<5:02:56, 18.36s/it, 183.60/180 seconds]

[32m[I 2022-02-24 15:53:37,375][0m Trial 9 finished with value: -2596.40427605 and parameters: {'n_steps': 1920, 'gamma': 0.8100982337631407, 'learning_rate': 2.4970414393538188e-05, 'clip_range': 0.2920457073245936, 'gae_lambda': 0.907499433933655}. Best is trial 1 with value: -2018.8031850500004.[0m





FrozenTrial(number=1, values=[-2018.8031850500004], datetime_start=datetime.datetime(2022, 2, 24, 15, 50, 51, 921959), datetime_complete=datetime.datetime(2022, 2, 24, 15, 51, 6, 764001), params={'n_steps': 3456, 'gamma': 0.8282786624430413, 'learning_rate': 9.88059620047403e-05, 'clip_range': 0.1703635184979676, 'gae_lambda': 0.8502116422706045}, distributions={'n_steps': IntUniformDistribution(high=8192, low=256, step=64), 'gamma': LogUniformDistribution(high=0.9999, low=0.8), 'learning_rate': LogUniformDistribution(high=0.0001, low=1e-05), 'clip_range': UniformDistribution(high=0.4, low=0.1), 'gae_lambda': UniformDistribution(high=0.99, low=0.8)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=1, state=TrialState.COMPLETE, value=None)

In [5]:
study.best_params


{'n_steps': 3456,
 'gamma': 0.8282786624430413,
 'learning_rate': 9.88059620047403e-05,
 'clip_range': 0.1703635184979676,
 'gae_lambda': 0.8502116422706045}

In [6]:
study.best_trial.number

1

In [7]:
import plotly

In [8]:

# if optuna.visualization.is_available():
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
fig = optuna.visualization.plot_parallel_coordinate(study)
fig.show()

In [9]:
if optuna.visualization.is_available():
    fig = optuna.visualization.plot_param_importances(study)
    fig.show()

In [10]:
import joblib
joblib.dump(study, OPT_DIR + 'study.pkl')

['./train/opt/2022-02-24_1550/study.pkl']

In [11]:
load_path = os.path.join(OPT_DIR, 'trial_{}_best_model.zip'.format(study.best_trial.number))

# model = PPO('MlpPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **study.best_params)

# env = SimpleWalk2DDynGoal()
# print(env.observation_space)
# env = Monitor(env)

model = PPO.load(
    path = load_path,
    env = env,
    )

print(model.action_space)
print(model.observation_space)


Box([-1. -1.], [1. 1.], (2,), float32)
Box([0. 0. 0. 0. 0. 0.], [20. 20. 20. 20. 20. 20.], (6,), float32)


In [12]:
model

<stable_baselines3.ppo.ppo.PPO at 0x7f69b40e9a60>

In [13]:
model.learn(
    total_timesteps=300_000, 
    callback=TrainAndLoggingCallback(
        check_freq=10_000, 
        save_path=LOG_DIR
        )
    )

<stable_baselines3.ppo.ppo.PPO at 0x7f69b40e9a60>

In [14]:
"""
env_name = 'SW2DDynGoal'

CHECKPOINT_DIR = './train/train_' + env_name
LOG_DIR = './train/log_' + env_name

callback = TrainAndLoggingCallback(check_freq=10_000, save_path=CHECKPOINT_DIR)

log_path = os.path.join('Training', 'Logs')

model = PPO(
    "MlpPolicy", 
    env, 
    verbose=0, 
    tensorboard_log=log_path,
    #learning_rate=0.0001,
    #n_steps =2048
    )
logger.setLevel(logging.INFO)

model.learn(
    total_timesteps=300_000, 
    callback = callback
    )

model.save('PPO')

logger.setLevel(logging.DEBUG)
"""

'\nenv_name = \'SW2DDynGoal\'\n\nCHECKPOINT_DIR = \'./train/train_\' + env_name\nLOG_DIR = \'./train/log_\' + env_name\n\ncallback = TrainAndLoggingCallback(check_freq=10_000, save_path=CHECKPOINT_DIR)\n\nlog_path = os.path.join(\'Training\', \'Logs\')\n\nmodel = PPO(\n    "MlpPolicy", \n    env, \n    verbose=0, \n    tensorboard_log=log_path,\n    #learning_rate=0.0001,\n    #n_steps =2048\n    )\nlogger.setLevel(logging.INFO)\n\nmodel.learn(\n    total_timesteps=300_000, \n    callback = callback\n    )\n\nmodel.save(\'PPO\')\n\nlogger.setLevel(logging.DEBUG)\n'

## Test Model

In [15]:
# from stable_baselines3 import PPO
# from SimpleWalk2D import SimpleWalk2DDynGoal

# env = SimpleWalk2DDynGoal()


In [16]:

# test prediction
logger.setLevel(logging.DEBUG)

episodes = 10
for episode in range(episodes):
    env.reset()

    while True:
        action, _states = model.predict(env.state)
        obs, rewards, done, info = env.step(action)
        if done:
            print('done')
            print("info", info)
            break
    env.render()

DEBUG:root:
DEBUG:root:reset
DEBUG:root:movement: 1.2534112930297852
DEBUG:root:movement: 1.6349587440490723
DEBUG:root:vector 1: [ 1.        -0.9670515]
DEBUG:root:vector 2: [ 1.        -0.8568659]
DEBUG:root:angle: 0.060182688895031276
DEBUG:root:movement: 1.533355474472046
DEBUG:root:vector 1: [ 1.        -0.8568659]
DEBUG:root:vector 2: [ 1.       -0.892787]
DEBUG:root:angle: 0.020350610921347927
DEBUG:root:movement: 1.4669404029846191
DEBUG:root:vector 1: [ 1.       -0.892787]
DEBUG:root:vector 2: [1.         0.39598656]
DEBUG:root:angle: 1.1058574227583382
DEBUG:root:reached goal


done
info {'distance_to_goal': 1.4022648, 'steps_taken': 5, 'previous_state': array([11.201361,  9.683871, 11.114122,  9.942972, 11.58969 ,  9.91863 ],
      dtype=float32), 'new_state': array([11.201361,  9.683871, 11.114122,  9.942972, 11.58969 ,  9.91863 ],
      dtype=float32), 'episode': {'r': 1059.764076, 'l': 5, 't': 614.633896}}


TypeError: render() takes 1 positional argument but 2 were given

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

evaluate_policy(model, env, n_eval_episodes=10, render=False)

# TODO render doesn't work