## Train Model

In [1]:
import os
import logging
from datetime import datetime
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
# Import the sb3 monitor for logging 
from stable_baselines3.common.monitor import Monitor
# Root logger, limited to INFO and above for the training run.
logger = logging.getLogger(None)
logger.setLevel(level=logging.INFO)

from carla_env import CarlaVehicleEnv, read_IP_from_file

# Read the CARLA server address from the host file next to the project.
carla_host = read_IP_from_file(file_name='../ip-host.txt')

# Build the raw environment, set its tick budget (20 * 60; presumably the
# per-episode limit -- confirm in carla_env), then wrap it in the SB3 Monitor
# so episode statistics are logged.
vehicle_env = CarlaVehicleEnv(verbose=False, host=carla_host)
vehicle_env.max_tick_count = 20 * 60
env = Monitor(vehicle_env)

DEBUG:root:waiting for server
DEBUG:root:try to connect to server 137.250.121.29
DEBUG:root:server connected
INFO:root:We want to use Town01, but the map is named Carla/Maps/Town10HD_Opt


IP: 137.250.121.29


INFO:root:Map Carla/Maps/Town01 loaded
DEBUG:root:spawning walker at Location(x=131.729736, y=59.330017, z=0.300000)
DEBUG:root:created vehicle.tesla.model3
DEBUG:root:created sensor.camera.semantic_segmentation


In [2]:
# Record when this run of the notebook started.
timestamp = datetime.now().strftime('%Y-%m-%d_%H%M')
print('Date and time:', timestamp)

Date and time: 2022-03-07_1727


### Train Callback

In [3]:
class TrainAndLoggingCallback(BaseCallback):
    """Periodically checkpoint the model that is being trained.

    Args:
        check_freq: Save a checkpoint whenever the callback's ``n_calls``
            counter is a multiple of this value.
        save_path: Directory that receives the ``best_model_<n_calls>`` files.
            ``None`` disables checkpointing entirely.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint on every ``check_freq``-th call.

        Returns:
            bool: Always ``True`` so that training continues.
        """
        # Mirror the None check in _init_callback: without it os.path.join
        # would raise a TypeError when checkpointing is disabled.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

DEBUG:root:=== creating observation ===
DEBUG:root:=== observation created ===


In [4]:
# Timestamp that keeps the output folders of separate runs apart.
now = datetime.now().strftime('%Y-%m-%d_%H%M')

LOG_DIR = './tmp/train/logs/' + now + '/'
OPT_DIR = './tmp/train/opt/' + now + '/'

SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(1))

# Checkpoint from an earlier run that training resumes from.
CHECKPOINT_PATH = './tmp/train/logs/2022-03-07_1615/best_model_330000'

# PPO.load is a classmethod that RETURNS the restored model. Calling it on an
# existing instance and discarding the result leaves that instance untrained,
# so the returned model has to be bound to `model`.
model = PPO.load(
    CHECKPOINT_PATH,
    env=env,
    verbose=1,
    tensorboard_log=LOG_DIR,
)

try:
    model.learn(
        total_timesteps=6_000_000,
        callback=TrainAndLoggingCallback(
            check_freq=10_000,
            save_path=LOG_DIR,
        ),
    )
finally:
    # Release the simulator connection even when training is interrupted.
    env.close()


Using cuda device
Wrapping the env in a DummyVecEnv.


### Optuna


In [None]:
# NOTE(review): `env` is closed here, yet the cells below pass the same `env`
# to PPO and evaluate_policy -- confirm it is re-created before running them.
env.close()


In [None]:
# Import the optimization framework (hyperparameter optimization, HPO)
import optuna
# PPO algo for RL
from stable_baselines3 import PPO
# Bring in the eval policy method for metric calculation
from stable_baselines3.common.evaluation import evaluate_policy

# Import the vec wrappers to vectorize and frame stack
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
# Import os to deal with filepaths
import os


# Define the hyperparameter search space and the objective function

# Minute-resolution timestamp used to namespace this study's output folders.
now = datetime.now().strftime('%Y-%m-%d_%H%M')

LOG_DIR = '{}{}/'.format('./train/logs/', now)
OPT_DIR = '{}{}/'.format('./train/opt/', now)

def optimize_ppo(trial):
    """Sample one set of PPO hyperparameters from the search space.

    Args:
        trial: Optuna trial (any object exposing ``suggest_int`` and
            ``suggest_float`` with the Optuna signatures works).

    Returns:
        dict: Values for ``n_steps``, ``gamma``, ``learning_rate``,
        ``clip_range`` and ``gae_lambda``, ready to be unpacked into ``PPO``.
    """
    # `step` is passed by keyword and the float suggestions go through
    # suggest_float: suggest_uniform / suggest_loguniform and the positional
    # step argument are deprecated in current Optuna releases.
    return {
        'n_steps': trial.suggest_int('n_steps', 256, 8192, step=64),
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True),
        'clip_range': trial.suggest_float('clip_range', 0.1, 0.4),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, 0.99),
    }
    
# Module-level default save location (trial 1); optimize_agent derives its
# own per-trial path and does not touch this value.
SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(1))


def optimize_agent(trial):
    """Optuna objective: train PPO with sampled hyperparameters and score it.

    Trains a fresh ``PPO`` model on the shared module-level ``env`` for
    100_000 timesteps, evaluates it over 30 episodes and saves it under
    ``OPT_DIR`` as ``trial_<number>_best_model``.

    Args:
        trial: Optuna trial used to sample hyperparameters and to name the
            saved model.

    Returns:
        The mean reward reported by ``evaluate_policy``.
    """
    # Get hyperparameters
    model_params = optimize_ppo(trial)

    # Create algo
    model = PPO(
        'MlpPolicy',
        env,
        tensorboard_log=LOG_DIR,
        verbose=0,
        **model_params,
    )
    model.learn(total_timesteps=100_000)

    # Evaluate model
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=30)

    # The shared `env` is deliberately NOT closed here: every following trial
    # and the cells after the study reuse the same environment object.

    # Use a local name so the module-level SAVE_PATH is not shadowed.
    trial_save_path = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
    model.save(trial_save_path)

    return mean_reward

    
# Creating the experiment 
# Higher mean reward is better, so the study maximizes the objective.
study = optuna.create_study(direction='maximize')
study.optimize(
    optimize_agent,
    n_trials=1_000,
    timeout=60 * 60 * 12,  # seconds, i.e. 12 hours
    show_progress_bar=True,
)

# A bare expression in the middle of a cell is not displayed by the notebook,
# so the results are printed explicitly.
print('Best params:', study.best_params)
print('Best trial:', study.best_trial)

print('Finished', datetime.now().strftime('%Y-%m-%d_%H%M'))

In [None]:
# Display the best hyperparameters found by the study.
study.best_params


In [None]:
# Display the number of the best trial; a later cell uses the same number to
# build the path of that trial's saved model.
study.best_trial.number

In [None]:
import plotly

In [None]:

# NOTE(review): the availability guard below is commented out here but active
# in the parameter-importance cell -- confirm whether this cell needs it too.
# if optuna.visualization.is_available():
from plotly.offline import init_notebook_mode, iplot
# Initialise plotly's notebook mode before rendering the figure.
init_notebook_mode(connected=True)
# Parallel-coordinate plot built from the study.
fig = optuna.visualization.plot_parallel_coordinate(study)
fig.show()

In [None]:
# Only plot when Optuna reports that its visualization support is available.
if optuna.visualization.is_available():
    # Parameter-importance plot built from the study.
    fig = optuna.visualization.plot_param_importances(study)
    fig.show()

In [None]:
import joblib

# OPT_DIR is only created as a side effect of saving a trial's model, so make
# sure it exists before writing the study into it.
os.makedirs(OPT_DIR, exist_ok=True)
joblib.dump(study, os.path.join(OPT_DIR, 'study.pkl'))

In [None]:
# Locate the model file that was saved for the study's best trial.
best_trial_number = study.best_trial.number
load_path = os.path.join(OPT_DIR, 'trial_{}_best_model.zip'.format(best_trial_number))

# Restore that model and attach it to the current environment.
model = PPO.load(load_path, env=env)

# Show the spaces of the restored model.
for space in (model.action_space, model.observation_space):
    print(space)


In [None]:
# Display the current `model` object.
model

In [None]:
# Checkpoint every 10_000 callback calls into the log directory while training.
checkpoint_callback = TrainAndLoggingCallback(check_freq=10_000, save_path=LOG_DIR)

# Continue training the loaded model for another 500_000 timesteps.
model.learn(total_timesteps=500_000, callback=checkpoint_callback)

In [None]:
"""
env_name = 'SW2DDynGoal'

CHECKPOINT_DIR = './train/train_' + env_name
LOG_DIR = './train/log_' + env_name

callback = TrainAndLoggingCallback(check_freq=10_000, save_path=CHECKPOINT_DIR)

log_path = os.path.join('Training', 'Logs')

model = PPO(
    "MlpPolicy", 
    env, 
    verbose=0, 
    tensorboard_log=log_path,
    #learning_rate=0.0001,
    #n_steps =2048
    )
logger.setLevel(logging.INFO)

model.learn(
    total_timesteps=300_000, 
    callback = callback
    )

model.save('PPO')

logger.setLevel(logging.DEBUG)
"""

## Test Model

In [None]:
# from stable_baselines3 import PPO
# from SimpleWalk2D import SimpleWalk2DDynGoal

#env = SimpleWalk2DDynGoal()


In [None]:

# test prediction
logger.setLevel(logging.DEBUG)

# Use the environment class that is imported at the top of the notebook and
# that the model was trained on; `CarlaWalkerEnv` is never imported here.
env = CarlaVehicleEnv(verbose=False, host=read_IP_from_file(file_name='../ip-host.txt'))
env.max_tick_count = 20*60
env = Monitor(env)

episodes = 10
try:
    for episode in range(episodes):
        # Feed the policy the observation returned by reset()/step(), i.e. the
        # same values it received during training.
        obs = env.reset()

        while True:
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            if done:
                print('done')
                print("info", info)
                break
        env.render()
finally:
    # Always release the simulator connection, even if a rollout fails.
    env.close()

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

# Evaluate the model over 10 episodes without rendering.
# NOTE(review): the previous cell ends with env.close() -- confirm `env` is
# still usable (or re-create it) before running this cell.
evaluate_policy(model, env, n_eval_episodes=10, render=False)

# TODO render doesn't work