diff --git a/ding/entry/__init__.py b/ding/entry/__init__.py index e0501b12db..15935c5793 100644 --- a/ding/entry/__init__.py +++ b/ding/entry/__init__.py @@ -5,7 +5,6 @@ from .serial_entry_onpolicy import serial_pipeline_onpolicy from .serial_entry_onpolicy_ppg import serial_pipeline_onpolicy_ppg from .serial_entry_offline import serial_pipeline_offline -from .serial_entry_ngu import serial_pipeline_ngu from .serial_entry_decision_transformer import serial_pipeline_dt from .serial_entry_reward_model_offpolicy import serial_pipeline_reward_model_offpolicy from .serial_entry_reward_model_onpolicy import serial_pipeline_reward_model_onpolicy @@ -17,13 +16,7 @@ from .application_entry import eval, collect_demo_data, collect_episodic_demo_data, \ episode_to_transitions, episode_to_transitions_filter from .application_entry_trex_collect_data import trex_collecting_data, collect_episodic_demo_data_for_trex -from .serial_entry_guided_cost import serial_pipeline_guided_cost -from .serial_entry_gail import serial_pipeline_gail from .utils import random_collect -from .serial_entry_preference_based_irl \ - import serial_pipeline_preference_based_irl -from .serial_entry_preference_based_irl_onpolicy \ - import serial_pipeline_preference_based_irl_onpolicy from .application_entry_drex_collect_data import drex_collecting_data from .serial_entry_mbrl import serial_pipeline_dyna, serial_pipeline_dream from .serial_entry_bco import serial_pipeline_bco diff --git a/ding/entry/serial_entry_gail.py b/ding/entry/serial_entry_gail.py deleted file mode 100644 index 4060291fac..0000000000 --- a/ding/entry/serial_entry_gail.py +++ /dev/null @@ -1,170 +0,0 @@ -from typing import Optional, Tuple -import os -import torch -from ditk import logging -from functools import partial -from tensorboardX import SummaryWriter -from copy import deepcopy -import numpy as np - -from ding.envs import get_vec_env_setting, create_env_manager -from ding.worker import BaseLearner, InteractionSerialEvaluator, BaseSerialCommander, create_buffer, \ - create_serial_collector -from ding.config import read_config, compile_config -from ding.policy import create_policy -from ding.reward_model import create_reward_model -from ding.utils import set_pkg_seed -from ding.entry import collect_demo_data -from ding.utils import save_file -from .utils import random_collect - - -def save_reward_model(path, reward_model, weights_name='best'): - path = os.path.join(path, 'reward_model', 'ckpt') - if not os.path.exists(path): - try: - os.makedirs(path) - except FileExistsError: - pass - path = os.path.join(path, 'ckpt_{}.pth.tar'.format(weights_name)) - state_dict = reward_model.state_dict() - save_file(path, state_dict) - print('Saved reward model ckpt in {}'.format(path)) - - -def serial_pipeline_gail( - input_cfg: Tuple[dict, dict], - expert_cfg: Tuple[dict, dict], - seed: int = 0, - model: Optional[torch.nn.Module] = None, - max_train_iter: Optional[int] = int(1e10), - max_env_step: Optional[int] = int(1e10), - collect_data: bool = True, -) -> 'Policy': # noqa - """ - Overview: - Serial pipeline entry for GAIL reward model. - Arguments: - - input_cfg (:obj:`Union[str, Tuple[dict, dict]]`): Config in dict type. \ - ``str`` type means config file path. \ - ``Tuple[dict, dict]`` type means [user_config, create_cfg]. - - expert_cfg (:obj:`Union[str, Tuple[dict, dict]]`): Expert config in dict type. \ - ``str`` type means config file path. \ - ``Tuple[dict, dict]`` type means [user_config, create_cfg]. - - seed (:obj:`int`): Random seed. 
- - model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module. - - max_train_iter (:obj:`Optional[int]`): Maximum policy update iterations in training. - - max_env_step (:obj:`Optional[int]`): Maximum collected environment interaction steps. - - collect_data (:obj:`bool`): Collect expert data. - Returns: - - policy (:obj:`Policy`): Converged policy. - """ - if isinstance(input_cfg, str): - cfg, create_cfg = read_config(input_cfg) - else: - cfg, create_cfg = deepcopy(input_cfg) - if isinstance(expert_cfg, str): - expert_cfg, expert_create_cfg = read_config(expert_cfg) - else: - expert_cfg, expert_create_cfg = expert_cfg - create_cfg.policy.type = create_cfg.policy.type + '_command' - cfg = compile_config(cfg, seed=seed, auto=True, create_cfg=create_cfg, save_cfg=True) - if 'data_path' not in cfg.reward_model: - cfg.reward_model.data_path = cfg.exp_name - # Load expert data - if collect_data: - if expert_cfg.policy.get('other', None) is not None and expert_cfg.policy.other.get('eps', None) is not None: - expert_cfg.policy.other.eps.collect = -1 - if expert_cfg.policy.get('load_path', None) is None: - expert_cfg.policy.load_path = cfg.reward_model.expert_model_path - collect_demo_data( - (expert_cfg, expert_create_cfg), - seed, - state_dict_path=expert_cfg.policy.load_path, - expert_data_path=cfg.reward_model.data_path + '/expert_data.pkl', - collect_count=cfg.reward_model.collect_count - ) - # Create main components: env, policy - env_fn, collector_env_cfg, evaluator_env_cfg = get_vec_env_setting(cfg.env) - collector_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in collector_env_cfg]) - evaluator_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in evaluator_env_cfg]) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed, dynamic_seed=False) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - policy = create_policy(cfg.policy, model=model, enable_field=['learn', 'collect', 'eval', 'command']) - - # Create worker components: learner, collector, evaluator, replay buffer, commander. - tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'serial')) - learner = BaseLearner(cfg.policy.learn.learner, policy.learn_mode, tb_logger, exp_name=cfg.exp_name) - collector = create_serial_collector( - cfg.policy.collect.collector, - env=collector_env, - policy=policy.collect_mode, - tb_logger=tb_logger, - exp_name=cfg.exp_name - ) - evaluator = InteractionSerialEvaluator( - cfg.policy.eval.evaluator, evaluator_env, policy.eval_mode, tb_logger, exp_name=cfg.exp_name - ) - replay_buffer = create_buffer(cfg.policy.other.replay_buffer, tb_logger=tb_logger, exp_name=cfg.exp_name) - commander = BaseSerialCommander( - cfg.policy.other.commander, learner, collector, evaluator, replay_buffer, policy.command_mode - ) - reward_model = create_reward_model(cfg.reward_model, policy.collect_mode.get_attribute('device'), tb_logger) - - # ========== - # Main loop - # ========== - # Learner's before_run hook. - learner.call_hook('before_run') - - # Accumulate plenty of data at the beginning of training. 
- if cfg.policy.get('random_collect_size', 0) > 0: - random_collect(cfg.policy, policy, collector, collector_env, commander, replay_buffer) - best_reward = -np.inf - while True: - collect_kwargs = commander.step() - # Evaluate policy performance - if evaluator.should_eval(learner.train_iter): - stop, reward = evaluator.eval(learner.save_checkpoint, learner.train_iter, collector.envstep) - reward_mean = np.array([r['eval_episode_return'] for r in reward]).mean() - if reward_mean >= best_reward: - save_reward_model(cfg.exp_name, reward_model, 'best') - best_reward = reward_mean - if stop: - break - new_data_count, target_new_data_count = 0, cfg.reward_model.get('target_new_data_count', 1) - while new_data_count < target_new_data_count: - new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs) - new_data_count += len(new_data) - # collect data for reward_model training - reward_model.collect_data(new_data) - replay_buffer.push(new_data, cur_collector_envstep=collector.envstep) - # update reward_model - reward_model.train() - reward_model.clear_data() - # Learn policy from collected data - for i in range(cfg.policy.learn.update_per_collect): - # Learner will train ``update_per_collect`` times in one iteration. - train_data = replay_buffer.sample(learner.policy.get_attribute('batch_size'), learner.train_iter) - if train_data is None: - # It is possible that replay buffer's data count is too few to train ``update_per_collect`` times - logging.warning( - "Replay buffer's data can only train for {} steps. ".format(i) + - "You can modify data collect config, e.g. increasing n_sample, n_episode." - ) - break - # update train_data reward using the augmented reward - train_data_augmented = reward_model.estimate(train_data) - learner.train(train_data_augmented, collector.envstep) - if learner.policy.get_attribute('priority'): - replay_buffer.update(learner.priority_info) - if collector.envstep >= max_env_step or learner.train_iter >= max_train_iter: - break - - # Learner's after_run hook. 
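# Note: this GAIL-specific entry is removed in favor of the generic off-policy
# reward-model pipeline (its new signature appears further below). A minimal sketch of
# the replacement usage, assuming the CartPole DQN + GAIL settings from the updated
# test_serial_entry_reward_model.py and an expert_data.pkl produced beforehand with
# collect_demo_data:
from copy import deepcopy
from ding.entry import serial_pipeline_reward_model_offpolicy
from dizoo.classic_control.cartpole.config.cartpole_dqn_config import \
    cartpole_dqn_config, cartpole_dqn_create_config

cfg, create_cfg = deepcopy(cartpole_dqn_config), deepcopy(cartpole_dqn_create_config)
create_cfg.reward_model = dict(type='gail')
cfg.reward_model = dict(
    type='gail',
    input_size=5,  # obs_dim + act_dim for CartPole, as in the updated unit test
    hidden_size_list=[64],  # replaces the old scalar ``hidden_size``
    batch_size=64,
    data_path='.',  # directory containing expert_data.pkl
)
# GAIL's discriminator must keep learning alongside the policy, so co-training stays on.
serial_pipeline_reward_model_offpolicy(
    (cfg, create_cfg), seed=0, cooptrain_reward_model=True, pretrain_reward_model=False
)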
- learner.call_hook('after_run') - save_reward_model(cfg.exp_name, reward_model, 'last') - # evaluate - # evaluator.eval(learner.save_checkpoint, learner.train_iter, collector.envstep) - return policy diff --git a/ding/entry/serial_entry_guided_cost.py b/ding/entry/serial_entry_guided_cost.py deleted file mode 100644 index a66f4535a2..0000000000 --- a/ding/entry/serial_entry_guided_cost.py +++ /dev/null @@ -1,162 +0,0 @@ -from typing import Union, Optional, List, Any, Tuple -import os -import copy -import torch -from ditk import logging -from functools import partial -from tensorboardX import SummaryWriter -from copy import deepcopy - -from ding.envs import get_vec_env_setting, create_env_manager -from ding.worker import BaseLearner, InteractionSerialEvaluator, BaseSerialCommander, create_buffer, \ - create_serial_collector -from ding.config import read_config, compile_config -from ding.policy import create_policy -from ding.reward_model import create_reward_model -from ding.utils import set_pkg_seed, save_file -from .utils import random_collect - - -def serial_pipeline_guided_cost( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - env_setting: Optional[List[Any]] = None, - model: Optional[torch.nn.Module] = None, - expert_model: Optional[torch.nn.Module] = None, - max_train_iter: Optional[int] = int(1e10), - max_env_step: Optional[int] = int(1e10), -) -> 'Policy': # noqa - """ - Overview: - Serial pipeline guided cost: we create this serial pipeline in order to\ - implement guided cost learning in DI-engine. For now, we support the following envs\ - Cartpole, Lunarlander, Hopper, Halfcheetah, Walker2d. The demonstration\ - data come from the expert model. We use a well-trained model to \ - generate demonstration data online - Arguments: - - input_cfg (:obj:`Union[str, Tuple[dict, dict]]`): Config in dict type. \ - ``str`` type means config file path. \ - ``Tuple[dict, dict]`` type means [user_config, create_cfg]. - - seed (:obj:`int`): Random seed. - - env_setting (:obj:`Optional[List[Any]]`): A list with 3 elements: \ - ``BaseEnv`` subclass, collector env config, and evaluator env config. - - model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module. - - expert_model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module.\ - The default model is DQN(**cfg.policy.model) - - max_train_iter (:obj:`Optional[int]`): Maximum policy update iterations in training. - - max_env_step (:obj:`Optional[int]`): Maximum collected environment interaction steps. - Returns: - - policy (:obj:`Policy`): Converged policy. 
- """ - if isinstance(input_cfg, str): - cfg, create_cfg = read_config(input_cfg) - else: - cfg, create_cfg = deepcopy(input_cfg) - create_cfg.policy.type = create_cfg.policy.type + '_command' - env_fn = None if env_setting is None else env_setting[0] - cfg = compile_config(cfg, seed=seed, env=env_fn, auto=True, create_cfg=create_cfg, save_cfg=True) - # Create main components: env, policy - if env_setting is None: - env_fn, collector_env_cfg, evaluator_env_cfg = get_vec_env_setting(cfg.env) - else: - env_fn, collector_env_cfg, evaluator_env_cfg = env_setting - collector_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in collector_env_cfg]) - expert_collector_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in collector_env_cfg]) - evaluator_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in evaluator_env_cfg]) - expert_collector_env.seed(cfg.seed) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed, dynamic_seed=False) - expert_policy = create_policy(cfg.policy, model=expert_model, enable_field=['learn', 'collect']) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - policy = create_policy(cfg.policy, model=model, enable_field=['learn', 'collect', 'eval', 'command']) - expert_policy.collect_mode.load_state_dict(torch.load(cfg.policy.collect.model_path, map_location='cpu')) - # Create worker components: learner, collector, evaluator, replay buffer, commander. - tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'serial')) - learner = BaseLearner(cfg.policy.learn.learner, policy.learn_mode, tb_logger, exp_name=cfg.exp_name) - collector = create_serial_collector( - cfg.policy.collect.collector, - env=collector_env, - policy=policy.collect_mode, - tb_logger=tb_logger, - exp_name=cfg.exp_name - ) - expert_collector = create_serial_collector( - cfg.policy.collect.collector, - env=expert_collector_env, - policy=expert_policy.collect_mode, - tb_logger=tb_logger, - exp_name=cfg.exp_name - ) - evaluator = InteractionSerialEvaluator( - cfg.policy.eval.evaluator, evaluator_env, policy.eval_mode, tb_logger, exp_name=cfg.exp_name - ) - replay_buffer = create_buffer(cfg.policy.other.replay_buffer, tb_logger=tb_logger, exp_name=cfg.exp_name) - expert_buffer = create_buffer(cfg.policy.other.replay_buffer, tb_logger=tb_logger, exp_name=cfg.exp_name) - commander = BaseSerialCommander( - cfg.policy.other.commander, learner, collector, evaluator, replay_buffer, policy.command_mode - ) - - reward_model = create_reward_model(cfg.reward_model, policy.collect_mode.get_attribute('device'), tb_logger) - # ========== - # Main loop - # ========== - # Learner's before_run hook. - learner.call_hook('before_run') - - # Accumulate plenty of data at the beginning of training. 
- if cfg.policy.get('random_collect_size', 0) > 0: - random_collect(cfg.policy, policy, collector, collector_env, commander, replay_buffer) - dirname = cfg.exp_name + '/reward_model' - if not os.path.exists(dirname): - try: - os.makedirs(dirname) - except FileExistsError: - pass - while True: - collect_kwargs = commander.step() - # Evaluate policy performance - if evaluator.should_eval(learner.train_iter): - stop, reward = evaluator.eval(learner.save_checkpoint, learner.train_iter, collector.envstep) - if stop: - break - # Collect data by default config n_sample/n_episode - new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs) - # NOTE: deepcopy data is very important, - # otherwise the data in the replay buffer will be incorrectly modified. - # NOTE: this line cannot move to line130, because in line134 the data may be modified in-place. - train_data = copy.deepcopy(new_data) - expert_data = expert_collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs) - replay_buffer.push(new_data, cur_collector_envstep=collector.envstep) - expert_buffer.push(expert_data, cur_collector_envstep=expert_collector.envstep) - # Learn policy from collected data - for i in range(cfg.reward_model.update_per_collect): - expert_demo = expert_buffer.sample(cfg.reward_model.batch_size, learner.train_iter) - samp = replay_buffer.sample(cfg.reward_model.batch_size, learner.train_iter) - reward_model.train(expert_demo, samp, learner.train_iter, collector.envstep) - for i in range(cfg.policy.learn.update_per_collect): - # Learner will train ``update_per_collect`` times in one iteration. - _ = reward_model.estimate(train_data) - if train_data is None: - # It is possible that replay buffer's data count is too few to train ``update_per_collect`` times - logging.warning( - "Replay buffer's data can only train for {} steps. ".format(i) + - "You can modify data collect config, e.g. increasing n_sample, n_episode." - ) - break - learner.train(train_data, collector.envstep) - if learner.policy.get_attribute('priority'): - replay_buffer.update(learner.priority_info) - if collector.envstep >= max_env_step or learner.train_iter >= max_train_iter: - break - # save reward model - if learner.train_iter % cfg.reward_model.store_model_every_n_train == 0: - #if learner.train_iter%5000 == 0: - path = os.path.join(dirname, 'iteration_{}.pth.tar'.format(learner.train_iter)) - state_dict = reward_model.state_dict_reward_model() - save_file(path, state_dict) - path = os.path.join(dirname, 'final_model.pth.tar') - state_dict = reward_model.state_dict_reward_model() - save_file(path, state_dict) - # Learner's after_run hook. 
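# Note: the guided-cost entry being removed here is likewise superseded by the generic
# off-policy pipeline. Per the 'guided_cost' case added to test_irl further below, the
# reward model rides on an off-policy PPO base config; the values are copied from that
# test and are illustrative only (the test additionally collects expert demonstrations
# with collect_demo_data before training):
from copy import deepcopy
from ding.entry import serial_pipeline_reward_model_offpolicy
from dizoo.classic_control.cartpole.config.cartpole_ppo_offpolicy_config import \
    cartpole_ppo_offpolicy_config, cartpole_ppo_offpolicy_create_config

cfg = deepcopy(cartpole_ppo_offpolicy_config)
create_cfg = deepcopy(cartpole_ppo_offpolicy_create_config)
create_cfg.reward_model = dict(type='guided_cost')
cfg.reward_model = dict(
    type='guided_cost',
    learning_rate=0.001,
    input_size=5,
    batch_size=32,
    continuous=False,
    update_per_collect=10,
    collect_count=1000,
)
serial_pipeline_reward_model_offpolicy(
    (cfg, create_cfg), seed=0, cooptrain_reward_model=True
)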
- learner.call_hook('after_run') - return policy diff --git a/ding/entry/serial_entry_ngu.py b/ding/entry/serial_entry_ngu.py deleted file mode 100644 index 176f5558cd..0000000000 --- a/ding/entry/serial_entry_ngu.py +++ /dev/null @@ -1,171 +0,0 @@ -from typing import Union, Optional, List, Any, Tuple -import os -import torch -from ditk import logging -from functools import partial -from tensorboardX import SummaryWriter -from copy import deepcopy - -from ding.envs import get_vec_env_setting, create_env_manager -from ding.worker import BaseLearner, InteractionSerialEvaluator, BaseSerialCommander, create_buffer, \ - create_serial_collector -from ding.config import read_config, compile_config -from ding.policy import create_policy -from ding.reward_model import create_reward_model -from ding.utils import set_pkg_seed -from .utils import random_collect - - -def serial_pipeline_ngu( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - env_setting: Optional[List[Any]] = None, - model: Optional[torch.nn.Module] = None, - max_train_iter: Optional[int] = int(1e10), - max_env_step: Optional[int] = int(1e10), -) -> 'Policy': # noqa - """ - Overview: - Serial pipeline entry for NGU. The corresponding paper is - `never give up: learning directed exploration strategies`. - Arguments: - - input_cfg (:obj:`Union[str, Tuple[dict, dict]]`): Config in dict type. \ - ``str`` type means config file path. \ - ``Tuple[dict, dict]`` type means [user_config, create_cfg]. - - seed (:obj:`int`): Random seed. - - env_setting (:obj:`Optional[List[Any]]`): A list with 3 elements: \ - ``BaseEnv`` subclass, collector env config, and evaluator env config. - - model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module. - - max_train_iter (:obj:`Optional[int]`): Maximum policy update iterations in training. - - max_env_step (:obj:`Optional[int]`): Maximum collected environment interaction steps. - Returns: - - policy (:obj:`Policy`): Converged policy. - """ - if isinstance(input_cfg, str): - cfg, create_cfg = read_config(input_cfg) - else: - cfg, create_cfg = deepcopy(input_cfg) - create_cfg.policy.type = create_cfg.policy.type + '_command' - env_fn = None if env_setting is None else env_setting[0] - cfg = compile_config(cfg, seed=seed, env=env_fn, auto=True, create_cfg=create_cfg, save_cfg=True) - # Create main components: env, policy - if env_setting is None: - env_fn, collector_env_cfg, evaluator_env_cfg = get_vec_env_setting(cfg.env) - else: - env_fn, collector_env_cfg, evaluator_env_cfg = env_setting - collector_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in collector_env_cfg]) - evaluator_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in evaluator_env_cfg]) - # if you want to save replay, please uncomment this line - # evaluator_env.enable_save_replay(cfg.env.replay_path) - - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed, dynamic_seed=False) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - policy = create_policy(cfg.policy, model=model, enable_field=['learn', 'collect', 'eval', 'command']) - - # Create worker components: learner, collector, evaluator, replay buffer, commander. 
- tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'serial')) - learner = BaseLearner(cfg.policy.learn.learner, policy.learn_mode, tb_logger, exp_name=cfg.exp_name) - collector = create_serial_collector( - cfg.policy.collect.collector, - env=collector_env, - policy=policy.collect_mode, - tb_logger=tb_logger, - exp_name=cfg.exp_name - ) - evaluator = InteractionSerialEvaluator( - cfg.policy.eval.evaluator, evaluator_env, policy.eval_mode, tb_logger, exp_name=cfg.exp_name - ) - replay_buffer = create_buffer(cfg.policy.other.replay_buffer, tb_logger=tb_logger, exp_name=cfg.exp_name) - commander = BaseSerialCommander( - cfg.policy.other.commander, learner, collector, evaluator, replay_buffer, policy.command_mode - ) - rnd_reward_model = create_reward_model(cfg.rnd_reward_model, policy.collect_mode.get_attribute('device'), tb_logger) - episodic_reward_model = create_reward_model( - cfg.episodic_reward_model, policy.collect_mode.get_attribute('device'), tb_logger - ) - # ========== - # Main loop - # ========== - # Learner's before_run hook. - learner.call_hook('before_run') - - # Accumulate plenty of data at the beginning of training. - if cfg.policy.get('random_collect_size', 0) > 0: - random_collect(cfg.policy, policy, collector, collector_env, commander, replay_buffer) - - estimate_cnt = 0 - iter_ = 0 - while True: - """some hyper-parameters used in NGU""" - # index_to_eps = {i: 0.4 ** (1 + 8 * i / (self._env_num - 1)) for i in range(self._env_num)} - # index_to_beta = { - # i: 0.3 * torch.sigmoid(torch.tensor(10 * (2 * i - (collector_env_num - 2)) / (collector_env_num - 2))) - # for i in range(collector_env_num) - # } - # index_to_gamma = { - # i: 1 - torch.exp( - # ( - # (collector_env_num - 1 - i) * torch.log(torch.tensor(1 - 0.997)) + - # i * torch.log(torch.tensor(1 - 0.99)) - # ) / (collector_env_num - 1) - # ) - # for i in range(collector_env_num) - # } - iter_ += 1 - - # Evaluate policy performance - if evaluator.should_eval(learner.train_iter): - stop, reward = evaluator.eval(learner.save_checkpoint, learner.train_iter, collector.envstep) - if stop: - break - # Collect data by default config n_sample/n_episode - new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=None) - - # collect data for reward_model training - rnd_reward_model.collect_data(new_data) - episodic_reward_model.collect_data(new_data) - replay_buffer.push(new_data, cur_collector_envstep=collector.envstep) - - # update reward_model - rnd_reward_model.train() - if (iter_ + 1) % cfg.rnd_reward_model.clear_buffer_per_iters == 0: - rnd_reward_model.clear_data() - episodic_reward_model.train() - if (iter_ + 1) % cfg.episodic_reward_model.clear_buffer_per_iters == 0: - episodic_reward_model.clear_data() - - # Learn policy from collected data - for i in range(cfg.policy.learn.update_per_collect): - # Learner will train ``update_per_collect`` times in one iteration. - train_data = replay_buffer.sample(learner.policy.get_attribute('batch_size'), learner.train_iter) - if train_data is None: - # It is possible that replay buffer's data count is too few to train ``update_per_collect`` times - logging.warning( - "Replay buffer's data can only train for {} steps. ".format(i) + - "You can modify data collect config, e.g. increasing n_sample, n_episode." 
- ) - break - # calculate the inter-episodic and episodic intrinsic reward - rnd_reward = rnd_reward_model.estimate(train_data) - episodic_reward = episodic_reward_model.estimate(train_data) - - # update train_data reward using the augmented reward - train_data_augmented, estimate_cnt = episodic_reward_model.fusion_reward( - train_data, - rnd_reward, - episodic_reward, - nstep=cfg.policy.nstep, - collector_env_num=cfg.policy.collect.env_num, - tb_logger=tb_logger, - estimate_cnt=estimate_cnt - ) - learner.train(train_data_augmented, collector.envstep) - if learner.policy.get_attribute('priority'): - replay_buffer.update(learner.priority_info) - if collector.envstep >= max_env_step or learner.train_iter >= max_train_iter: - break - - # Learner's after_run hook. - learner.call_hook('after_run') - return policy diff --git a/ding/entry/serial_entry_preference_based_irl.py b/ding/entry/serial_entry_preference_based_irl.py deleted file mode 100644 index 682e662baa..0000000000 --- a/ding/entry/serial_entry_preference_based_irl.py +++ /dev/null @@ -1,133 +0,0 @@ -import copy -from typing import Union, Optional, List, Any, Tuple -import os -import torch -from ditk import logging -from functools import partial -from tensorboardX import SummaryWriter -from copy import deepcopy - -from ding.envs import get_vec_env_setting, create_env_manager -from ding.worker import BaseLearner, InteractionSerialEvaluator, BaseSerialCommander, create_buffer, \ - create_serial_collector -from ding.config import read_config, compile_config -from ding.policy import create_policy, PolicyFactory -from ding.reward_model import create_reward_model -from ding.utils import set_pkg_seed - - -def serial_pipeline_preference_based_irl( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - env_setting: Optional[List[Any]] = None, - model: Optional[torch.nn.Module] = None, - max_train_iter: Optional[int] = int(1e10), - max_env_step: Optional[int] = int(1e10), -) -> 'Policy': # noqa - """ - Overview: - serial_pipeline_preference_based_irl. - Arguments: - - input_cfg (:obj:`Union[str, Tuple[dict, dict]]`): Config in dict type. \ - ``str`` type means config file path. \ - ``Tuple[dict, dict]`` type means [user_config, create_cfg]. - - seed (:obj:`int`): Random seed. - - env_setting (:obj:`Optional[List[Any]]`): A list with 3 elements: \ - ``BaseEnv`` subclass, collector env config, and evaluator env config. - - model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module. - - max_iterations (:obj:`Optional[torch.nn.Module]`): Learner's max iteration. Pipeline will stop \ - when reaching this iteration. - Returns: - - policy (:obj:`Policy`): Converged policy. 
- """ - if isinstance(input_cfg, str): - cfg, create_cfg = read_config(input_cfg) - else: - cfg, create_cfg = deepcopy(input_cfg) - create_cfg.policy.type = create_cfg.policy.type + '_command' - create_cfg.reward_model = dict(type=cfg.reward_model.type) - env_fn = None if env_setting is None else env_setting[0] - cfg = compile_config(cfg, seed=seed, env=env_fn, auto=True, create_cfg=create_cfg, save_cfg=True, renew_dir=False) - cfg_bak = copy.deepcopy(cfg) - # Create main components: env, policy - if env_setting is None: - env_fn, collector_env_cfg, evaluator_env_cfg = get_vec_env_setting(cfg.env) - else: - env_fn, collector_env_cfg, evaluator_env_cfg = env_setting - collector_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in collector_env_cfg]) - evaluator_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in evaluator_env_cfg]) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed, dynamic_seed=False) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - policy = create_policy(cfg.policy, model=model, enable_field=['learn', 'collect', 'eval', 'command']) - - # Create worker components: learner, collector, evaluator, replay buffer, commander. - tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'serial')) - learner = BaseLearner(cfg.policy.learn.learner, policy.learn_mode, tb_logger, exp_name=cfg.exp_name) - collector = create_serial_collector( - cfg.policy.collect.collector, - env=collector_env, - policy=policy.collect_mode, - tb_logger=tb_logger, - exp_name=cfg.exp_name - ) - evaluator = InteractionSerialEvaluator( - cfg.policy.eval.evaluator, evaluator_env, policy.eval_mode, tb_logger, exp_name=cfg.exp_name - ) - replay_buffer = create_buffer(cfg.policy.other.replay_buffer, tb_logger=tb_logger, exp_name=cfg.exp_name) - commander = BaseSerialCommander( - cfg.policy.other.commander, learner, collector, evaluator, replay_buffer, policy.command_mode - ) - - reward_model = create_reward_model(cfg_bak, policy.collect_mode.get_attribute('device'), tb_logger) - reward_model.train() - # ========== - # Main loop - # ========== - # Learner's before_run hook. - learner.call_hook('before_run') - - # Accumulate plenty of data at the beginning of training. - if cfg.policy.get('random_collect_size', 0) > 0: - if cfg.policy.get('transition_with_policy_data', False): - collector.reset_policy(policy.collect_mode) - else: - action_space = collector_env.env_info().act_space - random_policy = PolicyFactory.get_random_policy(policy.collect_mode, action_space=action_space) - collector.reset_policy(random_policy) - collect_kwargs = commander.step() - new_data = collector.collect(n_sample=cfg.policy.random_collect_size, policy_kwargs=collect_kwargs) - replay_buffer.push(new_data, cur_collector_envstep=0) - collector.reset_policy(policy.collect_mode) - while True: - collect_kwargs = commander.step() - # Evaluate policy performance - if evaluator.should_eval(learner.train_iter): - stop, reward = evaluator.eval(learner.save_checkpoint, learner.train_iter, collector.envstep) - if stop: - break - # Collect data by default config n_sample/n_episode - new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs) - replay_buffer.push(new_data, cur_collector_envstep=collector.envstep) - # Learn policy from collected data - for i in range(cfg.policy.learn.update_per_collect): - # Learner will train ``update_per_collect`` times in one iteration. 
- train_data = replay_buffer.sample(learner.policy.get_attribute('batch_size'), learner.train_iter) - if train_data is None: - # It is possible that replay buffer's data count is too few to train ``update_per_collect`` times - logging.warning( - "Replay buffer's data can only train for {} steps. ".format(i) + - "You can modify data collect config, e.g. increasing n_sample, n_episode." - ) - break - # update train_data reward using the augmented reward - train_data_augmented = reward_model.estimate(train_data) - learner.train(train_data_augmented, collector.envstep) - if learner.policy.get_attribute('priority'): - replay_buffer.update(learner.priority_info) - if collector.envstep >= max_env_step or learner.train_iter >= max_train_iter: - break - - # Learner's after_run hook. - learner.call_hook('after_run') - return policy diff --git a/ding/entry/serial_entry_preference_based_irl_onpolicy.py b/ding/entry/serial_entry_preference_based_irl_onpolicy.py deleted file mode 100644 index 3941f3337e..0000000000 --- a/ding/entry/serial_entry_preference_based_irl_onpolicy.py +++ /dev/null @@ -1,104 +0,0 @@ -from typing import Union, Optional, List, Any, Tuple -import os -import torch -from ditk import logging -from functools import partial -from tensorboardX import SummaryWriter -from copy import deepcopy - -from ding.envs import get_vec_env_setting, create_env_manager -from ding.worker import BaseLearner, InteractionSerialEvaluator, BaseSerialCommander, create_buffer, \ - create_serial_collector -from ding.config import read_config, compile_config -from ding.policy import create_policy, PolicyFactory -from ding.reward_model import create_reward_model -from ding.utils import set_pkg_seed - - -def serial_pipeline_preference_based_irl_onpolicy( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - env_setting: Optional[List[Any]] = None, - model: Optional[torch.nn.Module] = None, - max_train_iter: Optional[int] = int(1e10), - max_env_step: Optional[int] = int(1e10), -) -> 'Policy': # noqa - """ - Overview: - Serial pipeline entry for preference based irl of on-policy algorithm(such as PPO). - Arguments: - - input_cfg (:obj:`Union[str, Tuple[dict, dict]]`): Config in dict type. \ - ``str`` type means config file path. \ - ``Tuple[dict, dict]`` type means [user_config, create_cfg]. - - seed (:obj:`int`): Random seed. - - env_setting (:obj:`Optional[List[Any]]`): A list with 3 elements: \ - ``BaseEnv`` subclass, collector env config, and evaluator env config. - - model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module. - - max_train_iter (:obj:`Optional[int]`): Maximum policy update iterations in training. - - max_env_step (:obj:`Optional[int]`): Maximum collected environment interaction steps. - Returns: - - policy (:obj:`Policy`): Converged policy. 
- """ - if isinstance(input_cfg, str): - cfg, create_cfg = read_config(input_cfg) - else: - cfg, create_cfg = deepcopy(input_cfg) - create_cfg.policy.type = create_cfg.policy.type + '_command' - create_cfg.reward_model = dict(type=cfg.reward_model.type) - env_fn = None if env_setting is None else env_setting[0] - cfg = compile_config(cfg, seed=seed, env=env_fn, auto=True, create_cfg=create_cfg, save_cfg=True, renew_dir=False) - # Create main components: env, policy - if env_setting is None: - env_fn, collector_env_cfg, evaluator_env_cfg = get_vec_env_setting(cfg.env) - else: - env_fn, collector_env_cfg, evaluator_env_cfg = env_setting - collector_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in collector_env_cfg]) - evaluator_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in evaluator_env_cfg]) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed, dynamic_seed=False) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - policy = create_policy(cfg.policy, model=model, enable_field=['learn', 'collect', 'eval', 'command']) - - # Create worker components: learner, collector, evaluator, replay buffer, commander. - tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'serial')) - learner = BaseLearner(cfg.policy.learn.learner, policy.learn_mode, tb_logger, exp_name=cfg.exp_name) - collector = create_serial_collector( - cfg.policy.collect.collector, - env=collector_env, - policy=policy.collect_mode, - tb_logger=tb_logger, - exp_name=cfg.exp_name - ) - evaluator = InteractionSerialEvaluator( - cfg.policy.eval.evaluator, evaluator_env, policy.eval_mode, tb_logger, exp_name=cfg.exp_name - ) - commander = BaseSerialCommander( - cfg.policy.other.commander, learner, collector, evaluator, None, policy.command_mode - ) - reward_model = create_reward_model(cfg, policy.collect_mode.get_attribute('device'), tb_logger) - reward_model.train() - # ========== - # Main loop - # ========== - # Learner's before_run hook. - learner.call_hook('before_run') - - while True: - collect_kwargs = commander.step() - # Evaluate policy performance - if evaluator.should_eval(learner.train_iter): - stop, reward = evaluator.eval(learner.save_checkpoint, learner.train_iter, collector.envstep) - if stop: - break - # Collect data by default config n_sample/n_episode - new_data = collector.collect(train_iter=learner.train_iter) - train_data = new_data - # update train_data reward using the augmented reward - train_data_augmented = reward_model.estimate(train_data) - learner.train(train_data_augmented, collector.envstep) - if collector.envstep >= max_env_step or learner.train_iter >= max_train_iter: - break - - # Learner's after_run hook. 
- learner.call_hook('after_run') - return policy diff --git a/ding/entry/serial_entry_reward_model_offpolicy.py b/ding/entry/serial_entry_reward_model_offpolicy.py index bcfd0f882e..b4a4b84bc8 100644 --- a/ding/entry/serial_entry_reward_model_offpolicy.py +++ b/ding/entry/serial_entry_reward_model_offpolicy.py @@ -18,12 +18,14 @@ def serial_pipeline_reward_model_offpolicy( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - env_setting: Optional[List[Any]] = None, - model: Optional[torch.nn.Module] = None, - max_train_iter: Optional[int] = int(1e10), - max_env_step: Optional[int] = int(1e10), + input_cfg: Union[str, Tuple[dict, dict]], + seed: int = 0, + env_setting: Optional[List[Any]] = None, + model: Optional[torch.nn.Module] = None, + max_train_iter: Optional[int] = int(1e10), + max_env_step: Optional[int] = int(1e10), + cooptrain_reward_model: Optional[bool] = True, + pretrain_reward_model: Optional[bool] = False, ) -> 'Policy': # noqa """ Overview: @@ -38,6 +40,8 @@ def serial_pipeline_reward_model_offpolicy( - model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module. - max_train_iter (:obj:`Optional[int]`): Maximum policy update iterations in training. - max_env_step (:obj:`Optional[int]`): Maximum collected environment interaction steps. + - cooptrain_reward_model (:obj:`Optional[bool]`): Whether train reward model during policy training. + - pretrain_reward_model (:obj:`Optional[bool]`): Whether train reward model before policy training. Returns: - policy (:obj:`Policy`): Converged policy. """ @@ -78,6 +82,8 @@ def serial_pipeline_reward_model_offpolicy( cfg.policy.other.commander, learner, collector, evaluator, replay_buffer, policy.command_mode ) reward_model = create_reward_model(cfg.reward_model, policy.collect_mode.get_attribute('device'), tb_logger) + if pretrain_reward_model: + reward_model.train() # ========== # Main loop @@ -88,7 +94,6 @@ def serial_pipeline_reward_model_offpolicy( # Accumulate plenty of data at the beginning of training. if cfg.policy.get('random_collect_size', 0) > 0: random_collect(cfg.policy, policy, collector, collector_env, commander, replay_buffer) - count = 0 best_reward = -np.inf while True: collect_kwargs = commander.step() @@ -105,14 +110,15 @@ def serial_pipeline_reward_model_offpolicy( while new_data_count < target_new_data_count: new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs) new_data_count += len(new_data) - # collect data for reward_model training - reward_model.collect_data(new_data) replay_buffer.push(new_data, cur_collector_envstep=collector.envstep) - # update reward_model - reward_model.train() - # clear buffer per fix iters to make sure replay buffer's data count isn't too few. - if count % cfg.reward_model.clear_buffer_per_iters == 0: - reward_model.clear_data() + if cooptrain_reward_model: + # collect data for reward_model training + reward_model.collect_data(new_data) + # update reward_model, when you want to train reward_model inloop + if cooptrain_reward_model: + reward_model.train() + # clear buffer per fixed iters to make sure the data for RM training is not too offpolicy. + reward_model.clear_data(iter=learner.train_iter) # Learn policy from collected data for i in range(cfg.policy.learn.update_per_collect): # Learner will train ``update_per_collect`` times in one iteration. 
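# Note: the per-entry ``count`` / ``clear_buffer_per_iters`` bookkeeping is gone; the
# entry now just calls ``reward_model.clear_data(iter=learner.train_iter)`` as above and
# each reward model decides its own clearing cadence. The concrete per-model
# implementations are outside this hunk; a minimal sketch of the expected shape, assuming
# the usual ``clear_buffer_per_iters`` config key and a list-typed ``self.train_data``:
def clear_data(self, iter: int) -> None:
    # Drop accumulated training samples every ``clear_buffer_per_iters`` iterations so
    # the data used to fit the reward model does not become too off-policy/stale.
    if iter % self.cfg.clear_buffer_per_iters == 0:
        self.train_data.clear()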
@@ -131,7 +137,6 @@ def serial_pipeline_reward_model_offpolicy( replay_buffer.update(learner.priority_info) if collector.envstep >= max_env_step or learner.train_iter >= max_train_iter: break - count += 1 # Learner's after_run hook. learner.call_hook('after_run') diff --git a/ding/entry/serial_entry_reward_model_onpolicy.py b/ding/entry/serial_entry_reward_model_onpolicy.py index a30e5d8ef9..f1a3c388ab 100644 --- a/ding/entry/serial_entry_reward_model_onpolicy.py +++ b/ding/entry/serial_entry_reward_model_onpolicy.py @@ -18,12 +18,14 @@ def serial_pipeline_reward_model_onpolicy( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - env_setting: Optional[List[Any]] = None, - model: Optional[torch.nn.Module] = None, - max_train_iter: Optional[int] = int(1e10), - max_env_step: Optional[int] = int(1e10), + input_cfg: Union[str, Tuple[dict, dict]], + seed: int = 0, + env_setting: Optional[List[Any]] = None, + model: Optional[torch.nn.Module] = None, + max_train_iter: Optional[int] = int(1e10), + max_env_step: Optional[int] = int(1e10), + cooptrain_reward_model: Optional[bool] = True, + pretrain_reward_model: Optional[bool] = False, ) -> 'Policy': # noqa """ Overview: @@ -38,6 +40,8 @@ def serial_pipeline_reward_model_onpolicy( - model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module. - max_train_iter (:obj:`Optional[int]`): Maximum policy update iterations in training. - max_env_step (:obj:`Optional[int]`): Maximum collected environment interaction steps. + - cooptrain_reward_model (:obj:`Optional[bool]`): Whether train reward model during policy training. + - pretrain_reward_model (:obj:`Optional[bool]`): Whether train reward model before policy training. Returns: - policy (:obj:`Policy`): Converged policy. """ @@ -78,7 +82,8 @@ def serial_pipeline_reward_model_onpolicy( cfg.policy.other.commander, learner, collector, evaluator, replay_buffer, policy.command_mode ) reward_model = create_reward_model(cfg.reward_model, policy.collect_mode.get_attribute('device'), tb_logger) - + if pretrain_reward_model: + reward_model.train() # ========== # Main loop # ========== @@ -88,7 +93,6 @@ def serial_pipeline_reward_model_onpolicy( # Accumulate plenty of data at the beginning of training. if cfg.policy.get('random_collect_size', 0) > 0: random_collect(cfg.policy, policy, collector, collector_env, commander, replay_buffer) - count = 0 best_reward = -np.inf while True: collect_kwargs = commander.step() @@ -106,11 +110,12 @@ def serial_pipeline_reward_model_onpolicy( new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs) new_data_count += len(new_data) # collect data for reward_model training - reward_model.collect_data(new_data) + if cooptrain_reward_model: + reward_model.collect_data(new_data) # update reward_model - reward_model.train() - if count % cfg.reward_model.clear_buffer_per_iters == 0: - reward_model.clear_data() + if cooptrain_reward_model: + reward_model.train() + reward_model.clear_data(iter=learner.train_iter) # Learn policy from collected data for i in range(cfg.policy.learn.update_per_collect): # Learner will train ``update_per_collect`` times in one iteration. @@ -129,7 +134,6 @@ def serial_pipeline_reward_model_onpolicy( replay_buffer.update(learner.priority_info) if collector.envstep >= max_env_step or learner.train_iter >= max_train_iter: break - count += 1 # Learner's after_run hook. 
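# Note: the two new flags split reward-model training into two modes:
# ``pretrain_reward_model`` fits the model once before any policy update, while
# ``cooptrain_reward_model`` keeps updating it after every collect step. A sketch of the
# pretrain-only mode, mirroring the new TREX test further below (the expert checkpoint
# directory is a placeholder):
from copy import deepcopy
from easydict import EasyDict
from ding.entry import serial_pipeline_reward_model_offpolicy
from ding.entry.application_entry_trex_collect_data import trex_collecting_data
from dizoo.classic_control.cartpole.config.cartpole_trex_offppo_config import \
    cartpole_trex_offppo_config, cartpole_trex_offppo_create_config

config = [deepcopy(cartpole_trex_offppo_config), deepcopy(cartpole_trex_offppo_create_config)]
config[0].reward_model.expert_model_path = 'path/to/expert_exp_dir'  # placeholder path
# Rank expert trajectories first, then fit the TREX reward model once and keep it fixed
# while the policy trains on the learned reward.
trex_collecting_data(args=EasyDict({'cfg': deepcopy(config), 'seed': 0, 'device': 'cpu'}))
serial_pipeline_reward_model_offpolicy(
    config, seed=0, pretrain_reward_model=True, cooptrain_reward_model=False
)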
learner.call_hook('after_run') diff --git a/ding/entry/tests/test_serial_entry_guided_cost.py b/ding/entry/tests/test_serial_entry_guided_cost.py deleted file mode 100644 index 33742d4fb8..0000000000 --- a/ding/entry/tests/test_serial_entry_guided_cost.py +++ /dev/null @@ -1,23 +0,0 @@ -import pytest -import torch -from copy import deepcopy -from ding.entry import serial_pipeline_onpolicy, serial_pipeline_guided_cost -from dizoo.classic_control.cartpole.config import cartpole_ppo_config, cartpole_ppo_create_config -from dizoo.classic_control.cartpole.config import cartpole_gcl_ppo_onpolicy_config, \ - cartpole_gcl_ppo_onpolicy_create_config - - -@pytest.mark.unittest -def test_guided_cost(): - expert_policy_state_dict_path = './expert_policy.pth' - config = [deepcopy(cartpole_ppo_config), deepcopy(cartpole_ppo_create_config)] - expert_policy = serial_pipeline_onpolicy(config, seed=0) - torch.save(expert_policy.collect_mode.state_dict(), expert_policy_state_dict_path) - - config = [deepcopy(cartpole_gcl_ppo_onpolicy_config), deepcopy(cartpole_gcl_ppo_onpolicy_create_config)] - config[0].policy.collect.model_path = expert_policy_state_dict_path - config[0].policy.learn.update_per_collect = 1 - try: - serial_pipeline_guided_cost(config, seed=0, max_train_iter=1) - except Exception: - assert False, "pipeline fail" diff --git a/ding/entry/tests/test_serial_entry_preference_based_irl.py b/ding/entry/tests/test_serial_entry_preference_based_irl.py deleted file mode 100644 index 7e9198f929..0000000000 --- a/ding/entry/tests/test_serial_entry_preference_based_irl.py +++ /dev/null @@ -1,62 +0,0 @@ -import pytest -from copy import deepcopy -import os -from easydict import EasyDict - -import torch - -from ding.entry import serial_pipeline -from ding.entry import serial_pipeline_preference_based_irl -from dizoo.classic_control.cartpole.config.cartpole_trex_offppo_config import cartpole_trex_offppo_config,\ - cartpole_trex_offppo_create_config -from dizoo.classic_control.cartpole.config.cartpole_ppo_offpolicy_config import cartpole_ppo_offpolicy_config,\ - cartpole_ppo_offpolicy_create_config -from ding.entry.application_entry_trex_collect_data import trex_collecting_data -from ding.reward_model.trex_reward_model import TrexConvEncoder -from ding.torch_utils import is_differentiable - - -@pytest.mark.unittest -def test_serial_pipeline_trex(): - exp_name = 'test_serial_pipeline_trex_expert' - config = [deepcopy(cartpole_ppo_offpolicy_config), deepcopy(cartpole_ppo_offpolicy_create_config)] - config[0].policy.learn.learner.hook.save_ckpt_after_iter = 100 - config[0].exp_name = exp_name - expert_policy = serial_pipeline(config, seed=0) - - exp_name = 'test_serial_pipeline_trex_collect' - config = [deepcopy(cartpole_trex_offppo_config), deepcopy(cartpole_trex_offppo_create_config)] - config[0].exp_name = exp_name - config[0].reward_model.expert_model_path = 'test_serial_pipeline_trex_expert' - config[0].reward_model.checkpoint_max = 100 - config[0].reward_model.checkpoint_step = 100 - config[0].reward_model.num_snippets = 100 - args = EasyDict({'cfg': deepcopy(config), 'seed': 0, 'device': 'cpu'}) - trex_collecting_data(args=args) - try: - serial_pipeline_preference_based_irl(config, seed=0, max_train_iter=1) - except Exception: - assert False, "pipeline fail" - finally: - os.popen('rm -rf test_serial_pipeline_trex*') - - -B = 4 -C, H, W = 3, 128, 128 - - -@pytest.mark.unittest -class TestEncoder: - - def output_check(self, model, outputs): - loss = outputs.sum() - is_differentiable(loss, model) - - def 
test_conv_encoder(self): - inputs = torch.randn(B, C, H, W) - model = TrexConvEncoder((C, H, W)) - print(model) - outputs = model(inputs) - self.output_check(model, outputs) - print(outputs.shape) - assert outputs.shape == (B, 1) diff --git a/ding/entry/tests/test_serial_entry_preference_based_irl_onpolicy.py b/ding/entry/tests/test_serial_entry_preference_based_irl_onpolicy.py deleted file mode 100644 index ff0e88b0d5..0000000000 --- a/ding/entry/tests/test_serial_entry_preference_based_irl_onpolicy.py +++ /dev/null @@ -1,38 +0,0 @@ -import pytest -from copy import deepcopy -import os -from easydict import EasyDict - -import torch - -from ding.entry import serial_pipeline_onpolicy -from ding.entry import serial_pipeline_preference_based_irl_onpolicy -from dizoo.classic_control.cartpole.config import cartpole_ppo_config, cartpole_ppo_create_config -from dizoo.classic_control.cartpole.config import cartpole_trex_ppo_onpolicy_config, \ - cartpole_trex_ppo_onpolicy_create_config -from ding.entry.application_entry_trex_collect_data import trex_collecting_data - - -@pytest.mark.unittest -def test_serial_pipeline_trex_onpolicy(): - exp_name = 'test_serial_pipeline_trex_onpolicy_expert' - config = [deepcopy(cartpole_ppo_config), deepcopy(cartpole_ppo_create_config)] - config[0].policy.learn.learner.hook.save_ckpt_after_iter = 100 - config[0].exp_name = exp_name - expert_policy = serial_pipeline_onpolicy(config, seed=0) - - exp_name = 'test_serial_pipeline_trex_onpolicy_collect' - config = [deepcopy(cartpole_trex_ppo_onpolicy_config), deepcopy(cartpole_trex_ppo_onpolicy_create_config)] - config[0].exp_name = exp_name - config[0].reward_model.expert_model_path = 'test_serial_pipeline_trex_onpolicy_expert' - config[0].reward_model.checkpoint_max = 100 - config[0].reward_model.checkpoint_step = 100 - config[0].reward_model.num_snippets = 100 - args = EasyDict({'cfg': deepcopy(config), 'seed': 0, 'device': 'cpu'}) - trex_collecting_data(args=args) - try: - serial_pipeline_preference_based_irl_onpolicy(config, seed=0, max_train_iter=1) - except Exception: - assert False, "pipeline fail" - finally: - os.popen('rm -rf test_serial_pipeline_trex_onpolicy*') diff --git a/ding/entry/tests/test_serial_entry_reward_model.py b/ding/entry/tests/test_serial_entry_reward_model.py index 404cb6d78c..b971cfe137 100644 --- a/ding/entry/tests/test_serial_entry_reward_model.py +++ b/ding/entry/tests/test_serial_entry_reward_model.py @@ -1,15 +1,21 @@ import pytest import os -from ditk import logging from easydict import EasyDict from copy import deepcopy from dizoo.classic_control.cartpole.config.cartpole_dqn_config import cartpole_dqn_config, cartpole_dqn_create_config +from dizoo.classic_control.cartpole.config.cartpole_trex_offppo_config import cartpole_trex_offppo_config,\ + cartpole_trex_offppo_create_config +from dizoo.classic_control.cartpole.config.cartpole_drex_dqn_config import cartpole_drex_dqn_config, \ + cartpole_drex_dqn_create_config from dizoo.classic_control.cartpole.config.cartpole_ppo_offpolicy_config import cartpole_ppo_offpolicy_config, cartpole_ppo_offpolicy_create_config # noqa from dizoo.classic_control.cartpole.config.cartpole_rnd_onppo_config import cartpole_ppo_rnd_config, cartpole_ppo_rnd_create_config # noqa from dizoo.classic_control.cartpole.config.cartpole_ppo_icm_config import cartpole_ppo_icm_config, cartpole_ppo_icm_create_config # noqa +from dizoo.classic_control.cartpole.config.cartpole_ngu_config import cartpole_ngu_config, cartpole_ngu_create_config from ding.entry import 
serial_pipeline, collect_demo_data, serial_pipeline_reward_model_offpolicy, \ serial_pipeline_reward_model_onpolicy +from ding.entry.application_entry_trex_collect_data import trex_collecting_data +from ding.entry.application_entry_drex_collect_data import drex_collecting_data cfg = [ { @@ -20,7 +26,7 @@ { 'type': 'gail', 'input_size': 5, - 'hidden_size': 64, + 'hidden_size_list': [64], 'batch_size': 64, }, { @@ -32,11 +38,21 @@ { 'type': 'red', 'sample_size': 5000, - 'input_size': 5, - 'hidden_size': 64, + 'obs_shape': 4, + 'action_shape': 1, + 'hidden_size_list': [64, 1], 'update_per_collect': 200, 'batch_size': 128, }, + { + 'type': 'guided_cost', + 'learning_rate': 0.001, + 'input_size': 5, + 'batch_size': 32, + 'continuous': False, + 'update_per_collect': 10, + 'collect_count': 1000, + }, ] @@ -55,8 +71,12 @@ def test_irl(reward_model_config): config, seed=0, state_dict=state_dict, expert_data_path=expert_data_path, collect_count=collect_count ) # irl + rl training - cp_cartpole_dqn_config = deepcopy(cartpole_dqn_config) - cp_cartpole_dqn_create_config = deepcopy(cartpole_dqn_create_config) + if reward_model_config.type == 'guided_cost': + cp_cartpole_dqn_config = deepcopy(cartpole_ppo_offpolicy_config) + cp_cartpole_dqn_create_config = deepcopy(cartpole_ppo_offpolicy_create_config) + else: + cp_cartpole_dqn_config = deepcopy(cartpole_dqn_config) + cp_cartpole_dqn_create_config = deepcopy(cartpole_dqn_create_config) cp_cartpole_dqn_create_config.reward_model = dict(type=reward_model_config.type) if reward_model_config.type == 'gail': reward_model_config['data_path'] = '.' @@ -64,10 +84,15 @@ def test_irl(reward_model_config): reward_model_config['expert_data_path'] = expert_data_path cp_cartpole_dqn_config.reward_model = reward_model_config cp_cartpole_dqn_config.policy.collect.n_sample = 128 + cooptrain_reward_model = True + pretrain_reward_model = False serial_pipeline_reward_model_offpolicy( - (cp_cartpole_dqn_config, cp_cartpole_dqn_create_config), seed=0, max_train_iter=2 + (cp_cartpole_dqn_config, cp_cartpole_dqn_create_config), + seed=0, + max_train_iter=2, + pretrain_reward_model=pretrain_reward_model, + cooptrain_reward_model=cooptrain_reward_model ) - os.popen("rm -rf ckpt_* log expert_data.pkl") @@ -78,6 +103,8 @@ def test_rnd(): serial_pipeline_reward_model_onpolicy(config, seed=0, max_train_iter=2) except Exception: assert False, "pipeline fail" + finally: + os.popen("rm -rf cartpole_ppo_rnd*") @pytest.mark.unittest @@ -87,3 +114,79 @@ def test_icm(): serial_pipeline_reward_model_offpolicy(config, seed=0, max_train_iter=2) except Exception: assert False, "pipeline fail" + finally: + os.popen("rm -rf cartpole_ppo_icm*") + + +@pytest.mark.unittest +def test_ngu(): + config = [deepcopy(cartpole_ngu_config), deepcopy(cartpole_ngu_create_config)] + try: + serial_pipeline_reward_model_offpolicy(config, seed=0, max_train_iter=2) + except Exception: + assert False, "pipeline fail" + finally: + os.popen("rm -rf cartpole_ngu*") + + +@pytest.mark.unittest +def test_trex(): + exp_name = 'test_serial_pipeline_trex_expert' + config = [deepcopy(cartpole_ppo_offpolicy_config), deepcopy(cartpole_ppo_offpolicy_create_config)] + config[0].policy.learn.learner.hook.save_ckpt_after_iter = 100 + config[0].exp_name = exp_name + expert_policy = serial_pipeline(config, seed=0) + + exp_name = 'test_serial_pipeline_trex_collect' + config = [deepcopy(cartpole_trex_offppo_config), deepcopy(cartpole_trex_offppo_create_config)] + config[0].exp_name = exp_name + config[0].reward_model.exp_name = 
exp_name + config[0].reward_model.expert_model_path = 'test_serial_pipeline_trex_expert' + config[0].reward_model.checkpoint_max = 100 + config[0].reward_model.checkpoint_step = 100 + config[0].reward_model.num_snippets = 100 + args = EasyDict({'cfg': deepcopy(config), 'seed': 0, 'device': 'cpu'}) + trex_collecting_data(args=args) + try: + serial_pipeline_reward_model_offpolicy( + config, seed=0, max_train_iter=1, pretrain_reward_model=True, cooptrain_reward_model=False + ) + except Exception: + assert False, "pipeline fail" + finally: + os.popen('rm -rf test_serial_pipeline_trex*') + + +@pytest.mark.unittest +def test_drex(): + exp_name = 'test_serial_pipeline_drex_expert' + config = [deepcopy(cartpole_dqn_config), deepcopy(cartpole_dqn_create_config)] + config[0].policy.learn.learner.hook.save_ckpt_after_iter = 100 + config[0].exp_name = exp_name + expert_policy = serial_pipeline(config, seed=0) + + exp_name = 'test_serial_pipeline_drex_collect' + config = [deepcopy(cartpole_drex_dqn_config), deepcopy(cartpole_drex_dqn_create_config)] + config[0].exp_name = exp_name + config[0].reward_model.exp_name = exp_name + config[0].reward_model.expert_model_path = 'test_serial_pipeline_drex_expert/ckpt/ckpt_best.pth.tar' + config[0].reward_model.reward_model_path = 'test_serial_pipeline_drex_collect/cartpole.params' + config[0].reward_model.offline_data_path = 'test_serial_pipeline_drex_collect' + config[0].reward_model.checkpoint_max = 100 + config[0].reward_model.checkpoint_step = 100 + config[0].reward_model.num_snippets = 100 + + args = EasyDict({'cfg': deepcopy(config), 'seed': 0, 'device': 'cpu'}) + args.cfg[0].policy.collect.n_episode = 8 + del args.cfg[0].policy.collect.n_sample + args.cfg[0].bc_iteration = 1000 # for unittest + args.cfg[1].policy.type = 'bc' + drex_collecting_data(args=args) + try: + serial_pipeline_reward_model_offpolicy( + config, seed=0, max_train_iter=1, pretrain_reward_model=True, cooptrain_reward_model=False + ) + except Exception: + assert False, "pipeline fail" + finally: + os.popen('rm -rf test_serial_pipeline_drex*') diff --git a/ding/policy/ngu.py b/ding/policy/ngu.py index 95fe2dd82a..8c2ef6f48b 100644 --- a/ding/policy/ngu.py +++ b/ding/policy/ngu.py @@ -431,7 +431,7 @@ def _init_collect(self) -> None: # epsilon=0.4, alpha=9 self.eps = {i: 0.4 ** (1 + 8 * i / (self._cfg.collect.env_num - 1)) for i in range(self._cfg.collect.env_num)} - def _forward_collect(self, data: dict) -> dict: + def _forward_collect(self, data: dict, eps: float) -> dict: r""" Overview: Collect output according to eps_greedy plugin diff --git a/ding/reward_model/__init__.py b/ding/reward_model/__init__.py index 4538102861..47b5152b59 100644 --- a/ding/reward_model/__init__.py +++ b/ding/reward_model/__init__.py @@ -11,5 +11,7 @@ # exploration from .rnd_reward_model import RndRewardModel from .guided_cost_reward_model import GuidedCostRewardModel -from .ngu_reward_model import RndNGURewardModel, EpisodicNGURewardModel +from .ngu_reward_model import RndNGURewardModel, EpisodicNGURewardModel, NGURewardModel from .icm_reward_model import ICMRewardModel +from .network import RepresentationNetwork, RNDNetwork, REDNetwork, GAILNetwork, ICMNetwork, GCLNetwork, TREXNetwork +from .reward_model_utils import concat_state_action_pairs, combine_intrinsic_exterinsic_reward, obs_norm, collect_states diff --git a/ding/reward_model/base_reward_model.py b/ding/reward_model/base_reward_model.py index 963bacf1d7..ac52304d0b 100644 --- a/ding/reward_model/base_reward_model.py +++ 
b/ding/reward_model/base_reward_model.py @@ -60,10 +60,13 @@ def collect_data(self, data) -> None: raise NotImplementedError() @abstractmethod - def clear_data(self) -> None: + def clear_data(self, iter: int) -> None: """ Overview: - Clearing training data. \ + Clearing training data. + Arguments: + - iter (:obj:`int`): Current training iteration + Returns / Effects: This can be a side effect function which clears the data attribute in ``self`` """ raise NotImplementedError() diff --git a/ding/reward_model/drex_reward_model.py b/ding/reward_model/drex_reward_model.py index 645b469088..ea11b53f02 100644 --- a/ding/reward_model/drex_reward_model.py +++ b/ding/reward_model/drex_reward_model.py @@ -1,6 +1,7 @@ import copy from easydict import EasyDict import pickle +import numpy as np from ding.utils import REWARD_MODEL_REGISTRY @@ -66,6 +67,7 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> self.demo_data = [] self.load_expert_data() + self._logger.info("device: {}".format(device)) def load_expert_data(self) -> None: """ @@ -77,11 +79,23 @@ def load_expert_data(self) -> None: """ super(DrexRewardModel, self).load_expert_data() - with open(self.cfg.reward_model.offline_data_path + '/suboptimal_data.pkl', 'rb') as f: + with open(self.cfg.offline_data_path + '/suboptimal_data.pkl', 'rb') as f: self.demo_data = pickle.load(f) def train(self): - self._train() + + training_inputs, training_outputs = self.training_obs, self.training_labels + + cum_loss = 0.0 + training_data = list(zip(training_inputs, training_outputs)) + for epoch in range(self.cfg.update_per_collect): + np.random.shuffle(training_data) + training_obs, training_labels = zip(*training_data) + cum_loss = self._train(training_obs, training_labels) + self.train_iter += 1 + self._logger.info("[epoch {}] loss {}".format(epoch, cum_loss)) + self.tb_logger.add_scalar("drex_reward/train_loss_iteration", cum_loss, self.train_iter) + return_dict = self.pred_data(self.demo_data) res, pred_returns = return_dict['real'], return_dict['pred'] self._logger.info("real: " + str(res)) diff --git a/ding/reward_model/gail_irl_model.py b/ding/reward_model/gail_irl_model.py index 6533e114dd..b82e2d17e6 100644 --- a/ding/reward_model/gail_irl_model.py +++ b/ding/reward_model/gail_irl_model.py @@ -1,7 +1,6 @@ from typing import List, Dict, Any import pickle import random -from collections.abc import Iterable from easydict import EasyDict import torch @@ -10,99 +9,11 @@ from ding.utils import REWARD_MODEL_REGISTRY from .base_reward_model import BaseRewardModel -import torch.nn.functional as F +from .reward_model_utils import concat_state_action_pairs +from .network import GAILNetwork from functools import partial -def concat_state_action_pairs(iterator): - """ - Overview: - Concatenate state and action pairs from input. - Arguments: - - iterator (:obj:`Iterable`): Iterables with at least ``obs`` and ``action`` tensor keys. - Returns: - - res (:obj:`Torch.tensor`): State and action pairs. - """ - assert isinstance(iterator, Iterable) - res = [] - for item in iterator: - state = item['obs'].flatten() # to allow 3d obs and actions concatenation - action = item['action'] - s_a = torch.cat([state, action.float()], dim=-1) - res.append(s_a) - return res - - -def concat_state_action_pairs_one_hot(iterator, action_size: int): - """ - Overview: - Concatenate state and action pairs from input. Action values are one-hot encoded - Arguments: - - iterator (:obj:`Iterable`): Iterables with at least ``obs`` and ``action`` tensor keys. 
- Returns: - - res (:obj:`Torch.tensor`): State and action pairs. - """ - assert isinstance(iterator, Iterable) - res = [] - for item in iterator: - state = item['obs'].flatten() # to allow 3d obs and actions concatenation - action = item['action'] - action = torch.Tensor([int(i == action) for i in range(action_size)]) - s_a = torch.cat([state, action], dim=-1) - res.append(s_a) - return res - - -class RewardModelNetwork(nn.Module): - - def __init__(self, input_size: int, hidden_size: int, output_size: int) -> None: - super(RewardModelNetwork, self).__init__() - self.l1 = nn.Linear(input_size, hidden_size) - self.l2 = nn.Linear(hidden_size, output_size) - self.a1 = nn.Tanh() - self.a2 = nn.Sigmoid() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - out = x - out = self.l1(out) - out = self.a1(out) - out = self.l2(out) - out = self.a2(out) - return out - - -class AtariRewardModelNetwork(nn.Module): - - def __init__(self, input_size: int, action_size: int) -> None: - super(AtariRewardModelNetwork, self).__init__() - self.input_size = input_size - self.action_size = action_size - self.conv1 = nn.Conv2d(4, 16, 7, stride=3) - self.conv2 = nn.Conv2d(16, 16, 5, stride=2) - self.conv3 = nn.Conv2d(16, 16, 3, stride=1) - self.conv4 = nn.Conv2d(16, 16, 3, stride=1) - self.fc1 = nn.Linear(784, 64) - self.fc2 = nn.Linear(64 + self.action_size, 1) # here we add 1 to take consideration of the action concat - self.a = nn.Sigmoid() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - # input: x = [B, 4 x 84 x 84 + self.action_size], last element is action - actions = x[:, -self.action_size:] # [B, self.action_size] - # get observations - x = x[:, :-self.action_size] - x = x.reshape([-1] + self.input_size) # [B, 4, 84, 84] - x = F.leaky_relu(self.conv1(x)) - x = F.leaky_relu(self.conv2(x)) - x = F.leaky_relu(self.conv3(x)) - x = F.leaky_relu(self.conv4(x)) - x = x.reshape(-1, 784) - x = F.leaky_relu(self.fc1(x)) - x = torch.cat([x, actions], dim=-1) - x = self.fc2(x) - r = self.a(x) - return r - - @REWARD_MODEL_REGISTRY.register('gail') class GailRewardModel(BaseRewardModel): """ @@ -127,10 +38,15 @@ class GailRewardModel(BaseRewardModel): | | obs_dim + act_dim | 7 | ``target_new_`` int 64 | Collect steps per iteration | | ``data_count`` | | - 8 | ``hidden_size`` int 128 | Linear model hidden size | - 9 | ``collect_count`` int 100000 | Expert dataset size | One entry is a (s,a) + 8 | ``hidden_size`` list( [32,32,64] | Sequence of ``hidden_size`` | + | ``_list`` int) | of reward network. | + 9 | ``kernel_size`` list( [5,3] | kernel size list | only used in image + | int) | | input + 10 | ``stride`` list( | stride size list | only used in image + | int) | | input + 11 | ``collect_count`` int 100000 | Expert dataset size | One entry is a (s,a) | | | tuple - 10 | ``clear_buffer_`` int 1 | clear buffer per fixed iters | make sure replay + 12 | ``clear_buffer_`` int 1 | clear buffer per fixed iters | make sure replay | ``per_iters`` | buffer's data count | | isn't too few. | | (code work in entry) @@ -147,12 +63,19 @@ class GailRewardModel(BaseRewardModel): update_per_collect=100, # (int) How many samples in a training batch. batch_size=64, - # (int) Size of the input: obs_dim + act_dim. + # (int) Size of the input: obs_dim. input_size=4, + # Size of the input: act_dim. + # action_size=6, # (int) Collect steps per iteration. target_new_data_count=64, - # (int) Linear model hidden size. - hidden_size=128, + # (list(int)) Sequence of ``hidden_size`` of reward network. 
+ # if the input is vector, hidden_size_list = [hidden_size] + hidden_size_list=[16, 16, 16, 16, 64], + # (list(int)) kernel size list, used for image input, size should be len(hidden_size_list) - 1 + kernel_size=[7, 5, 3, 3], + # (list(int)) stride size list, used for image input, size should be len(hidden_size_list) - 1 + stride=[3, 2, 2, 1], # (int) Expert dataset size. collect_count=100000, # (int) Clear buffer per fixed iters. @@ -175,12 +98,19 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> self.tb_logger = tb_logger obs_shape = config.input_size if isinstance(obs_shape, int) or len(obs_shape) == 1: - self.reward_model = RewardModelNetwork(config.input_size, config.hidden_size, 1) + self.reward_model = GAILNetwork(obs_shape, config.hidden_size_list, nn.Tanh()) self.concat_state_action_pairs = concat_state_action_pairs elif len(obs_shape) == 3: action_shape = self.cfg.action_size - self.reward_model = AtariRewardModelNetwork(config.input_size, action_shape) - self.concat_state_action_pairs = partial(concat_state_action_pairs_one_hot, action_size=action_shape) + self.reward_model = GAILNetwork( + obs_shape, + config.hidden_size_list, + config.kernel_size, + config.stride, + nn.LeakyReLU(), + action_shape=action_shape + ) + self.concat_state_action_pairs = partial(concat_state_action_pairs, action_size=action_shape, one_hot_=True) self.reward_model.to(self.device) self.expert_data = [] self.train_data = [] @@ -201,6 +131,7 @@ def load_expert_data(self) -> None: with open(self.cfg.data_path + '/expert_data.pkl', 'rb') as f: self.expert_data_loader: list = pickle.load(f) self.expert_data = self.concat_state_action_pairs(self.expert_data_loader) + self.expert_data = torch.unbind(self.expert_data, dim=0) def state_dict(self) -> Dict[str, Any]: return { @@ -210,23 +141,20 @@ def state_dict(self) -> Dict[str, Any]: def load_state_dict(self, state_dict: Dict[str, Any]) -> None: self.reward_model.load_state_dict(state_dict['model']) - def learn(self, train_data: torch.Tensor, expert_data: torch.Tensor) -> float: + def _train(self) -> float: """ Overview: - Helper function for ``train`` which calculates loss for train data and expert data. - Arguments: - - train_data (:obj:`torch.Tensor`): Data used for training - - expert_data (:obj:`torch.Tensor`): Expert data + Helper function for ``train`` which calculates loss for train data and expert data. Returns: - - Combined loss calculated of reward model from using ``train_data`` and ``expert_data``. + - Combined loss of the reward model calculated from ``states_actions_tensor``. """ - # calculate loss, here are some hyper-param - out_1: torch.Tensor = self.reward_model(train_data) - loss_1: torch.Tensor = torch.log(out_1 + 1e-8).mean() - out_2: torch.Tensor = self.reward_model(expert_data) - loss_2: torch.Tensor = torch.log(1 - out_2 + 1e-8).mean() - # log(x) with 0 None: - This is a side effect function which updates the reward model and increment the train iteration count.
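(Illustrative sketch, not part of this patch: how the GAIL discriminator is now built from the shared ``GAILNetwork`` introduced in ``ding/reward_model/network.py`` later in this diff. The input width, hidden sizes, and batch sizes below are assumptions for a low-dimensional observation.)

import torch
import torch.nn as nn
from ding.reward_model.network import GAILNetwork

# vector case: obs and action are concatenated, so the input width is obs_dim + act_dim (assumed 4 + 1)
net = GAILNetwork(5, hidden_size_list=[128], activation=nn.Tanh())

agent_batch = torch.rand(64, 5)   # (s, a) pairs collected by the current policy
expert_batch = torch.rand(64, 5)  # (s, a) pairs loaded from expert_data.pkl

loss = net.learn(agent_batch, expert_batch)           # discriminator loss used inside _train()
reward = -torch.log(net.forward(agent_batch) + 1e-8)  # per-sample GAIL reward, as in estimate()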
""" for _ in range(self.cfg.update_per_collect): - sample_expert_data: list = random.sample(self.expert_data, self.cfg.batch_size) - sample_train_data: list = random.sample(self.train_data, self.cfg.batch_size) - sample_expert_data = torch.stack(sample_expert_data).to(self.device) - sample_train_data = torch.stack(sample_train_data).to(self.device) - loss = self.learn(sample_train_data, sample_expert_data) + loss = self._train() self.tb_logger.add_scalar('reward_model/gail_loss', loss, self.train_iter) self.train_iter += 1 @@ -264,9 +188,9 @@ def estimate(self, data: list) -> List[Dict]: # otherwise the reward of data in the replay buffer will be incorrectly modified. train_data_augmented = self.reward_deepcopy(data) res = self.concat_state_action_pairs(train_data_augmented) - res = torch.stack(res).to(self.device) + res = res.to(self.device) with torch.no_grad(): - reward = self.reward_model(res).squeeze(-1).cpu() + reward = self.reward_model.forward(res).squeeze(-1).cpu() reward = torch.chunk(reward, reward.shape[0], dim=0) for item, rew in zip(train_data_augmented, reward): item['reward'] = -torch.log(rew + 1e-8) @@ -282,12 +206,21 @@ def collect_data(self, data: list) -> None: Effects: - This is a side effect function which updates the data attribute in ``self`` """ - self.train_data.extend(self.concat_state_action_pairs(data)) + data = self.concat_state_action_pairs(data) + data = torch.unbind(data, dim=0) + self.train_data.extend(data) - def clear_data(self) -> None: + def clear_data(self, iter: int) -> None: """ Overview: - Clearing training data. \ - This is a side effect function which clears the data attribute in ``self`` + Clearing training data. + Arguments: + - iter (:obj:`int`): Current training iteration + Returns / Effects: + This can be a side effect function which clears the data attribute in ``self`` """ - self.train_data.clear() + assert hasattr( + self.cfg, 'clear_buffer_per_iters' + ), "Reward Model does not have clear_buffer_per_iters, Clear failed" + if iter % self.cfg.clear_buffer_per_iters == 0: + self.train_data.clear() diff --git a/ding/reward_model/guided_cost_reward_model.py b/ding/reward_model/guided_cost_reward_model.py index 437e198f53..1c237f2651 100644 --- a/ding/reward_model/guided_cost_reward_model.py +++ b/ding/reward_model/guided_cost_reward_model.py @@ -1,8 +1,9 @@ from typing import List, Dict, Any from easydict import EasyDict +import pickle +import random import torch -import torch.nn as nn import torch.optim as optim import torch.nn.functional as F from torch.distributions import Independent, Normal @@ -10,27 +11,7 @@ from ding.utils import REWARD_MODEL_REGISTRY from ding.utils.data import default_collate from .base_reward_model import BaseRewardModel - - -class GuidedCostNN(nn.Module): - - def __init__( - self, - input_size, - hidden_size=128, - output_size=1, - ): - super(GuidedCostNN, self).__init__() - self.net = nn.Sequential( - nn.Linear(input_size, hidden_size), - nn.ReLU(), - nn.Linear(hidden_size, hidden_size), - nn.ReLU(), - nn.Linear(hidden_size, output_size), - ) - - def forward(self, x): - return self.net(x) +from .network import GCLNetwork @REWARD_MODEL_REGISTRY.register('guided_cost') @@ -55,10 +36,6 @@ class GuidedCostRewardModel(BaseRewardModel): 5 | ``batch_size`` int 64 | Training batch size | 6 | ``hidden_size`` int 128 | Linear model hidden size | 7 | ``action_shape`` int 1 | Action space shape | - 8 | ``log_every_n`` int 50 | add loss to log every n iteration | - | ``_train`` | | - 9 | ``store_model_`` int 100 | save model 
every n iteration | - | ``every_n_train`` | == ==================== ======== ============= ======================================== ================ """ @@ -80,24 +57,55 @@ class GuidedCostRewardModel(BaseRewardModel): # Bigger "update_per_collect" means bigger off-policy. # collect data -> update policy-> collect data -> ... update_per_collect=100, - # (int) Add loss to log every n iteration. - log_every_n_train=50, - # (int) Save model every n iteration. - store_model_every_n_train=100, ) def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> None: # noqa super(GuidedCostRewardModel, self).__init__() self.cfg = config - self.action_shape = self.cfg.action_shape assert device == "cpu" or device.startswith("cuda") self.device = device self.tb_logger = tb_logger - self.reward_model = GuidedCostNN(config.input_size, config.hidden_size) + self.iter = 0 + self.reward_model = GCLNetwork( + config.input_size, [config.hidden_size, config.hidden_size], + output_size=1, + action_shape=config.action_shape + ) self.reward_model.to(self.device) self.opt = optim.Adam(self.reward_model.parameters(), lr=config.learning_rate) + self.train_data = [] + self.load_expert_data() + + def load_expert_data(self) -> None: + """ + Overview: + Getting the expert data from ``config['expert_data_path']`` attribute in self. + Effects: + This is a side effect function which updates the expert data attribute (e.g. ``self.expert_data``) + """ + with open(self.cfg.expert_data_path, 'rb') as f: + self.expert_data = pickle.load(f) + + def train(self) -> None: + """ + Overview: + Train the reward model. + """ + # sample data for expert and train data + sample_size = min(len(self.expert_data), self.cfg.batch_size) + expert_demo = random.sample(self.expert_data, sample_size) + samp = random.sample(self.train_data, sample_size) + + # remove non-tensor data in data list + samp = self._remove_redundant_keys(samp) + + # train the reward model + for _ in range(self.cfg.update_per_collect): + loss_ioc = self._train(expert_demo, samp) + self.tb_logger.add_scalar('reward_model/loss_iter', loss_ioc, self.iter) + self.iter += 1 - def train(self, expert_demo: torch.Tensor, samp: torch.Tensor, iter, step): + def _train(self, expert_demo: torch.Tensor, samp: torch.Tensor) -> float: device_0 = expert_demo[0]['obs'].device device_1 = samp[0]['obs'].device for i in range(len(expert_demo)): @@ -118,23 +126,13 @@ def train(self, expert_demo: torch.Tensor, samp: torch.Tensor, iter, step): samp.extend(expert_demo) expert_demo = default_collate(expert_demo) samp = default_collate(samp) - cost_demo = self.reward_model( - torch.cat([expert_demo['obs'], expert_demo['action'].float().reshape(-1, self.action_shape)], dim=-1) - ) - cost_samp = self.reward_model( - torch.cat([samp['obs'], samp['action'].float().reshape(-1, self.action_shape)], dim=-1) - ) - - prob = samp['prob'].unsqueeze(-1) - loss_IOC = torch.mean(cost_demo) + \ - torch.log(torch.mean(torch.exp(-cost_samp)/(prob+1e-7))) + loss_IOC = self.reward_model.learn(expert_demo, samp) # UPDATING THE COST FUNCTION self.opt.zero_grad() loss_IOC.backward() self.opt.step() - if iter % self.cfg.log_every_n_train == 0: - self.tb_logger.add_scalar('reward_model/loss_iter', loss_IOC, iter) - self.tb_logger.add_scalar('reward_model/loss_step', loss_IOC, step) + + return loss_IOC.item() def estimate(self, data: list) -> List[Dict]: # NOTE: this estimate method of gcl alg. 
is a little different from the one in other irl alg., @@ -142,7 +140,7 @@ def estimate(self, data: list) -> List[Dict]: train_data_augmented = data for i in range(len(train_data_augmented)): with torch.no_grad(): - reward = self.reward_model( + reward = self.reward_model.forward( torch.cat([train_data_augmented[i]['obs'], train_data_augmented[i]['action'].float()]).unsqueeze(0) ).squeeze(0) train_data_augmented[i]['reward'] = -reward @@ -156,9 +154,9 @@ def collect_data(self, data) -> None: if online_net is trained continuously, there should be some implementations in collect_data method """ # if online_net is trained continuously, there should be some implementations in collect_data method - pass + self.train_data.extend(data) - def clear_data(self): + def clear_data(self, iter: int): """ Overview: Collecting clearing data, not implemented if reward model (i.e. online_net) is only trained ones, \ @@ -176,3 +174,23 @@ def state_dict_reward_model(self) -> Dict[str, Any]: def load_state_dict_reward_model(self, state_dict: Dict[str, Any]) -> None: self.reward_model.load_state_dict(state_dict['model']) self.opt.load_state_dict(state_dict['optimizer']) + + def _remove_redundant_keys(self, samp: List[Dict]) -> List[Dict]: + """ + Overview: + Remove redundant keys in the data list. + Arguments: + - samp (:obj:`List[Dict]`): The data list. + Returns: + - (:obj:`List[Dict]`): The data list without redundant keys. + """ + keeped_keys = ['obs', 'next_obs', 'action', 'logit'] + assert samp is not None and bool(samp), "samp is empty." + assert all(key in samp[0] for key in keeped_keys), "samp is missing required keys." + fixed_samp = [] + for item in samp: + fixed_item = {} + for key in keeped_keys: + fixed_item[key] = item[key] + fixed_samp.append(fixed_item) + return fixed_samp diff --git a/ding/reward_model/icm_reward_model.py b/ding/reward_model/icm_reward_model.py index 9cc6e23e9b..d988a1aed5 100644 --- a/ding/reward_model/icm_reward_model.py +++ b/ding/reward_model/icm_reward_model.py @@ -3,13 +3,12 @@ import random import torch -import torch.nn as nn import torch.optim as optim -from ding.utils import SequenceType, REWARD_MODEL_REGISTRY -from ding.model import FCEncoder, ConvEncoder -from ding.torch_utils import one_hot +from ding.utils import REWARD_MODEL_REGISTRY from .base_reward_model import BaseRewardModel +from .network import ICMNetwork +from .reward_model_utils import combine_intrinsic_exterinsic_reward def collect_states(iterator: list) -> Tuple[list, list, list]: @@ -26,102 +25,6 @@ def collect_states(iterator: list) -> Tuple[list, list, list]: return states, next_states, actions -class ICMNetwork(nn.Module): - """ - Intrinsic Curiosity Model (ICM Module) - Implementation of: - [1] Curiosity-driven Exploration by Self-supervised Prediction - Pathak, Agrawal, Efros, and Darrell - UC Berkeley - ICML 2017. 
- https://arxiv.org/pdf/1705.05363.pdf - [2] Code implementation reference: - https://github.com/pathak22/noreward-rl - https://github.com/jcwleo/curiosity-driven-exploration-pytorch - - 1) Embedding observations into a latent space - 2) Predicting the action logit given two consecutive embedded observations - 3) Predicting the next embedded obs, given the embeded former observation and action - """ - - def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType, action_shape: int) -> None: - super(ICMNetwork, self).__init__() - if isinstance(obs_shape, int) or len(obs_shape) == 1: - self.feature = FCEncoder(obs_shape, hidden_size_list) - elif len(obs_shape) == 3: - self.feature = ConvEncoder(obs_shape, hidden_size_list) - else: - raise KeyError( - "not support obs_shape for pre-defined encoder: {}, please customize your own ICM model". - format(obs_shape) - ) - self.action_shape = action_shape - feature_output = hidden_size_list[-1] - self.inverse_net = nn.Sequential(nn.Linear(feature_output * 2, 512), nn.ReLU(), nn.Linear(512, action_shape)) - self.residual = nn.ModuleList( - [ - nn.Sequential( - nn.Linear(action_shape + 512, 512), - nn.LeakyReLU(), - nn.Linear(512, 512), - ) for _ in range(8) - ] - ) - self.forward_net_1 = nn.Sequential(nn.Linear(action_shape + feature_output, 512), nn.LeakyReLU()) - self.forward_net_2 = nn.Linear(action_shape + 512, feature_output) - - def forward(self, state: torch.Tensor, next_state: torch.Tensor, - action_long: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - r""" - Overview: - Use observation, next_observation and action to genearte ICM module - Parameter updates with ICMNetwork forward setup. - Arguments: - - state (:obj:`torch.Tensor`): - The current state batch - - next_state (:obj:`torch.Tensor`): - The next state batch - - action_long (:obj:`torch.Tensor`): - The action batch - Returns: - - real_next_state_feature (:obj:`torch.Tensor`): - Run with the encoder. Return the real next_state's embedded feature. - - pred_next_state_feature (:obj:`torch.Tensor`): - Run with the encoder and residual network. Return the predicted next_state's embedded feature. - - pred_action_logit (:obj:`torch.Tensor`): - Run with the encoder. Return the predicted action logit. 
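(Illustrative sketch, not part of this patch: the ``ICMNetwork`` deleted here is moved into ``ding/reward_model/network.py`` further down, with the loss computation split into ``learn()`` and the intrinsic-reward computation into ``forward()``. Shapes and hyper-parameters below are assumptions.)

import torch
from ding.reward_model.network import ICMNetwork

net = ICMNetwork(
    obs_shape=4, hidden_size_list=[64, 64, 128], residual_number=4, inverse_hidden_size=512, action_shape=2
)
obs = torch.rand(32, 4)
next_obs = torch.rand(32, 4)
action = torch.randint(0, 2, (32, ))

inverse_loss, forward_loss, accuracy = net.learn(obs, next_obs, action)  # consumed by ICMRewardModel._train
curiosity = net.forward(obs, next_obs, action)                           # raw intrinsic reward, used by estimate()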
- Shapes: - - state (:obj:`torch.Tensor`): :math:`(B, N)`, where B is the batch size and N is ''obs_shape'' - - next_state (:obj:`torch.Tensor`): :math:`(B, N)`, where B is the batch size and N is ''obs_shape'' - - action_long (:obj:`torch.Tensor`): :math:`(B)`, where B is the batch size'' - - real_next_state_feature (:obj:`torch.Tensor`): :math:`(B, M)`, where B is the batch size - and M is embedded feature size - - pred_next_state_feature (:obj:`torch.Tensor`): :math:`(B, M)`, where B is the batch size - and M is embedded feature size - - pred_action_logit (:obj:`torch.Tensor`): :math:`(B, A)`, where B is the batch size - and A is the ''action_shape'' - """ - action = one_hot(action_long, num=self.action_shape) - encode_state = self.feature(state) - encode_next_state = self.feature(next_state) - # get pred action logit - concat_state = torch.cat((encode_state, encode_next_state), 1) - pred_action_logit = self.inverse_net(concat_state) - # --------------------- - - # get pred next state - pred_next_state_feature_orig = torch.cat((encode_state, action), 1) - pred_next_state_feature_orig = self.forward_net_1(pred_next_state_feature_orig) - - # residual - for i in range(4): - pred_next_state_feature = self.residual[i * 2](torch.cat((pred_next_state_feature_orig, action), 1)) - pred_next_state_feature_orig = self.residual[i * 2 + 1]( - torch.cat((pred_next_state_feature, action), 1) - ) + pred_next_state_feature_orig - pred_next_state_feature = self.forward_net_2(torch.cat((pred_next_state_feature_orig, action), 1)) - real_next_state_feature = encode_next_state - return real_next_state_feature, pred_next_state_feature, pred_action_logit - - @REWARD_MODEL_REGISTRY.register('icm') class ICMRewardModel(BaseRewardModel): """ @@ -145,19 +48,22 @@ class ICMRewardModel(BaseRewardModel): list]) 5 | ``action_shape`` int 7 | the action space shape | 6 | ``batch_size`` int 64 | Training batch size | - 7 | ``hidden`` list [64, 64, | the MLP layer shape | + 7 | ``residual_num`` int 4 | the residual number of residual net | + 8 | ``hidden`` list [64, 64, | the MLP layer shape | | ``_size_list`` (int) 128] | | - 8 | ``update_per_`` int 100 | Number of updates per collect | + 9 | ``inverse_`` int 512 | the inverse model hidden size | + | ``hidden_size`` | | + 10 | ``update_per_`` int 100 | Number of updates per collect | | ``collect`` | | - 9 | ``reverse_scale`` float 1 | the importance weight of the | - | forward and reverse loss | - 10 | ``intrinsic_`` float 0.003 | the weight of intrinsic reward | r = w*r_i + r_e + 11 | ``reverse_loss`` float 1 | the importance weight of the | + ``_weight`` | forward and reverse loss | + 12 | ``intrinsic_`` float 0.003 | the weight of intrinsic reward | r = w*r_i + r_e ``reward_weight`` - 11 | ``extrinsic_`` bool True | Whether to normlize + 13 | ``extrinsic_`` bool True | Whether to normlize ``reward_norm`` | extrinsic reward - 12 | ``extrinsic_`` int 1 | the upper bound of the reward + 14 | ``extrinsic_`` int 1 | the upper bound of the reward ``reward_norm_max`` | normalization - 13 | ``clear_buffer`` int 1 | clear buffer per fixed iters | make sure replay + 15 | ``clear_buffer`` int 1 | clear buffer per fixed iters | make sure replay ``_per_iters`` | buffer's data count | isn't too few. | (code work in entry) @@ -178,12 +84,16 @@ class ICMRewardModel(BaseRewardModel): batch_size=64, # (list) The MLP layer shape. hidden_size_list=[64, 64, 128], + # (int) the residual number. 
+ residual_num=4, + # (int) The hidden layer shape of inverse network + inverse_hidden_size=512, # (int) How many updates(iterations) to train after collector's one collection. # Bigger "update_per_collect" means bigger off-policy. # collect data -> update policy-> collect data -> ... update_per_collect=100, # (float) The importance weight of the forward and reverse loss. - reverse_scale=1, + reverse_loss_weight=1, # (float) The weight of intrinsic reward. # r = intrinsic_reward_weight * r_i + r_e. intrinsic_reward_weight=0.003, # 1/300 @@ -202,7 +112,10 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> assert device == "cpu" or device.startswith("cuda") self.device = device self.tb_logger = tb_logger - self.reward_model = ICMNetwork(config.obs_shape, config.hidden_size_list, config.action_shape) + self.reward_model = ICMNetwork( + config.obs_shape, config.hidden_size_list, config.residual_num, config.inverse_hidden_size, + config.action_shape + ) self.reward_model.to(self.device) self.intrinsic_reward_type = config.intrinsic_reward_type assert self.intrinsic_reward_type in ['add', 'new', 'assign'] @@ -211,15 +124,11 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> self.train_next_states = [] self.train_actions = [] self.opt = optim.Adam(self.reward_model.parameters(), config.learning_rate) - self.ce = nn.CrossEntropyLoss(reduction="mean") - self.forward_mse = nn.MSELoss(reduction='none') - self.reverse_scale = config.reverse_scale - self.res = nn.Softmax(dim=-1) + self.reverse_loss_weight = config.reverse_loss_weight self.estimate_cnt_icm = 0 self.train_cnt_icm = 0 - def _train(self) -> None: - self.train_cnt_icm += 1 + def _train(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, float]: train_data_list = [i for i in range(0, len(self.train_states))] train_data_index = random.sample(train_data_list, self.cfg.batch_size) data_states: list = [self.train_states[i] for i in train_data_index] @@ -229,26 +138,26 @@ def _train(self) -> None: data_actions: list = [self.train_actions[i] for i in train_data_index] data_actions: torch.Tensor = torch.cat(data_actions).to(self.device) - real_next_state_feature, pred_next_state_feature, pred_action_logit = self.reward_model( + # action_accuracy: the accuracy of predicting the action. 
+ inverse_loss, forward_loss, action_accuracy = self.reward_model.learn( data_states, data_next_states, data_actions ) - inverse_loss = self.ce(pred_action_logit, data_actions.long()) - forward_loss = self.forward_mse(pred_next_state_feature, real_next_state_feature.detach()).mean() - self.tb_logger.add_scalar('icm_reward/forward_loss', forward_loss, self.train_cnt_icm) - self.tb_logger.add_scalar('icm_reward/inverse_loss', inverse_loss, self.train_cnt_icm) - action = torch.argmax(self.res(pred_action_logit), -1) - accuracy = torch.sum(action == data_actions.squeeze(-1)).item() / data_actions.shape[0] - self.tb_logger.add_scalar('icm_reward/action_accuracy', accuracy, self.train_cnt_icm) - loss = self.reverse_scale * inverse_loss + forward_loss - self.tb_logger.add_scalar('icm_reward/total_loss', loss, self.train_cnt_icm) - loss = self.reverse_scale * inverse_loss + forward_loss + loss = self.reverse_loss_weight * inverse_loss + forward_loss + self.opt.zero_grad() loss.backward() self.opt.step() + return loss, inverse_loss, forward_loss, action_accuracy + def train(self) -> None: for _ in range(self.cfg.update_per_collect): - self._train() + loss, inverse_loss, forward_loss, action_accuracy = self._train() + self.tb_logger.add_scalar('icm_reward/total_loss', loss, self.train_cnt_icm) + self.tb_logger.add_scalar('icm_reward/forward_loss', forward_loss, self.train_cnt_icm) + self.tb_logger.add_scalar('icm_reward/inverse_loss', inverse_loss, self.train_cnt_icm) + self.tb_logger.add_scalar('icm_reward/action_accuracy', action_accuracy, self.train_cnt_icm) + self.train_cnt_icm += 1 def estimate(self, data: list) -> List[Dict]: # NOTE: deepcopy reward part of data is very important, @@ -259,33 +168,18 @@ def estimate(self, data: list) -> List[Dict]: next_states = torch.stack(next_states).to(self.device) actions = torch.cat(actions).to(self.device) with torch.no_grad(): - real_next_state_feature, pred_next_state_feature, _ = self.reward_model(states, next_states, actions) - raw_icm_reward = self.forward_mse(real_next_state_feature, pred_next_state_feature).mean(dim=1) + raw_icm_reward = self.reward_model.forward(states, next_states, actions) self.estimate_cnt_icm += 1 self.tb_logger.add_scalar('icm_reward/raw_icm_reward_max', raw_icm_reward.max(), self.estimate_cnt_icm) self.tb_logger.add_scalar('icm_reward/raw_icm_reward_mean', raw_icm_reward.mean(), self.estimate_cnt_icm) self.tb_logger.add_scalar('icm_reward/raw_icm_reward_min', raw_icm_reward.min(), self.estimate_cnt_icm) self.tb_logger.add_scalar('icm_reward/raw_icm_reward_std', raw_icm_reward.std(), self.estimate_cnt_icm) - icm_reward = (raw_icm_reward - raw_icm_reward.min()) / (raw_icm_reward.max() - raw_icm_reward.min() + 1e-8) - self.tb_logger.add_scalar('icm_reward/icm_reward_max', icm_reward.max(), self.estimate_cnt_icm) - self.tb_logger.add_scalar('icm_reward/icm_reward_mean', icm_reward.mean(), self.estimate_cnt_icm) - self.tb_logger.add_scalar('icm_reward/icm_reward_min', icm_reward.min(), self.estimate_cnt_icm) - self.tb_logger.add_scalar('icm_reward/icm_reward_std', icm_reward.std(), self.estimate_cnt_icm) - icm_reward = (raw_icm_reward - raw_icm_reward.min()) / (raw_icm_reward.max() - raw_icm_reward.min() + 1e-8) - icm_reward = icm_reward.to(self.device) - for item, icm_rew in zip(train_data_augmented, icm_reward): - if self.intrinsic_reward_type == 'add': - if self.cfg.extrinsic_reward_norm: - item['reward'] = item[ - 'reward'] / self.cfg.extrinsic_reward_norm_max + icm_rew * self.cfg.intrinsic_reward_weight - else: - 
item['reward'] = item['reward'] + icm_rew * self.cfg.intrinsic_reward_weight - elif self.intrinsic_reward_type == 'new': - item['intrinsic_reward'] = icm_rew - if self.cfg.extrinsic_reward_norm: - item['reward'] = item['reward'] / self.cfg.extrinsic_reward_norm_max - elif self.intrinsic_reward_type == 'assign': - item['reward'] = icm_rew + normalized_icm_reward = (raw_icm_reward - + raw_icm_reward.min()) / (raw_icm_reward.max() - raw_icm_reward.min() + 1e-8) + normalized_icm_reward = normalized_icm_reward.to(self.device) + train_data_augmented = combine_intrinsic_exterinsic_reward( + train_data_augmented, normalized_icm_reward, self.cfg + ) return train_data_augmented @@ -296,11 +190,15 @@ def collect_data(self, data: list) -> None: self.train_next_states.extend(next_states) self.train_actions.extend(actions) - def clear_data(self) -> None: - self.train_data.clear() - self.train_states.clear() - self.train_next_states.clear() - self.train_actions.clear() + def clear_data(self, iter: int) -> None: + assert hasattr( + self.cfg, 'clear_buffer_per_iters' + ), "Reward Model does not have clear_buffer_per_iters, Clear failed" + if iter % self.cfg.clear_buffer_per_iters == 0: + self.train_data.clear() + self.train_states.clear() + self.train_next_states.clear() + self.train_actions.clear() def state_dict(self) -> Dict: return self.reward_model.state_dict() diff --git a/ding/reward_model/network.py b/ding/reward_model/network.py new file mode 100644 index 0000000000..567f395002 --- /dev/null +++ b/ding/reward_model/network.py @@ -0,0 +1,447 @@ +from typing import Union, Tuple, List, Dict, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ding.utils import SequenceType +from ding.model import FCEncoder, ConvEncoder +from ding.torch_utils import one_hot +from ding.utils import RunningMeanStd + + +class RepresentationNetwork(nn.Module): + + def __init__( + self, + obs_shape: Union[int, SequenceType], + hidden_size_list: SequenceType, + activation: Optional[nn.Module] = nn.ReLU(), + kernel_size: Optional[SequenceType] = [8, 4, 3], + stride: Optional[SequenceType] = [4, 2, 1], + ) -> None: + super(RepresentationNetwork, self).__init__() + if isinstance(obs_shape, int) or len(obs_shape) == 1: + self.feature = FCEncoder(obs_shape, hidden_size_list, activation=activation) + elif len(obs_shape) == 3: + self.feature = ConvEncoder( + obs_shape, hidden_size_list, activation=activation, kernel_size=kernel_size, stride=stride + ) + else: + raise KeyError( + "not support obs_shape for pre-defined encoder: {}, please customize your own Representation Network". 
+ format(obs_shape) + ) + + def forward(self, obs: torch.Tensor) -> torch.Tensor: + feature_output = self.feature(obs) + return feature_output + + +class RNDNetwork(nn.Module): + + def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType) -> None: + super(RNDNetwork, self).__init__() + self.target = RepresentationNetwork(obs_shape, hidden_size_list) + self.predictor = RepresentationNetwork(obs_shape, hidden_size_list) + + for param in self.target.parameters(): + param.requires_grad = False + + def forward(self, obs: torch.Tensor, norm: Optional[bool] = True) -> torch.Tensor: + with torch.no_grad(): + predict_feature = self.predictor(obs) + target_feature = self.target(obs) + reward = F.mse_loss(predict_feature, target_feature, reduction='none').mean(dim=1) + if norm: + reward = (reward - reward.min()) / (reward.max() - reward.min() + 1e-8) + return reward + + def learn(self, obs: torch.Tensor) -> torch.Tensor: + predict_feature = self.predictor(obs) + with torch.no_grad(): + target_feature = self.target(obs) + loss = F.mse_loss(predict_feature, target_feature.detach()) + return loss + + +class REDNetwork(RNDNetwork): + + def __init__( + self, + obs_shape: int, + action_shape: int, + hidden_size_list: SequenceType, + sigma: Optional[float] = 0.5 + ) -> None: + # RED network does not support high dimension obs + super().__init__(obs_shape + action_shape, hidden_size_list) + self.sigma = sigma + + def forward(self, obs: torch.Tensor) -> torch.Tensor: + with torch.no_grad(): + predict_feature = self.predictor(obs) + target_feature = self.target(obs) + reward = F.mse_loss(predict_feature, target_feature, reduction='none').mean(dim=1) + reward = torch.exp(-self.sigma * reward) + return reward + + +class GAILNetwork(nn.Module): + + def __init__( + self, + obs_shape: Union[int, SequenceType], + hidden_size_list: SequenceType, + kernel_size: Optional[SequenceType] = None, + stride: Optional[SequenceType] = None, + activation: Optional[nn.Module] = nn.ReLU(), + action_shape: Optional[int] = None + ) -> None: + super(GAILNetwork, self).__init__() + # Gail will need one more fc layer after RepresentationNetwork, and it will use another activation function + self.act = nn.Sigmoid() + if isinstance(obs_shape, int) or len(obs_shape) == 1: + self.feature = RepresentationNetwork(obs_shape, hidden_size_list, activation) + self.fc = nn.Linear(hidden_size_list[0], 1) + self.image_input = False + elif len(obs_shape) == 3: + self.action_size = action_shape + self.obs_size = obs_shape + self.feature = RepresentationNetwork(obs_shape, hidden_size_list, activation, kernel_size, stride) + self.fc = nn.Linear(64 + self.action_size, 1) + self.image_input = True + + def learn(self, train_data: torch.Tensor, expert_data: torch.Tensor) -> torch.Tensor: + out_1: torch.Tensor = self.forward(train_data) + loss_1: torch.Tensor = torch.log(out_1 + 1e-8).mean() + out_2: torch.Tensor = self.forward(expert_data) + loss_2: torch.Tensor = torch.log(1 - out_2 + 1e-8).mean() + + loss: torch.Tensor = -(loss_1 + loss_2) + + return loss + + def forward(self, data: torch.Tensor) -> torch.Tensor: + if self.image_input: + # input: x = [B, 4 x 84 x 84 + self.action_size], last element is action + actions = data[:, -self.action_size:] # [B, self.action_size] + # get observations + obs = data[:, :-self.action_size] + obs = obs.reshape([-1] + self.obs_size) # [B, 4, 84, 84] + obs = self.feature(obs) + data = torch.cat([obs, actions], dim=-1) + else: + data = self.feature(data) + + data = self.fc(data) + reward = 
self.act(data) + return reward + + +class ICMNetwork(nn.Module): + """ + Intrinsic Curiosity Model (ICM Module) + Implementation of: + [1] Curiosity-driven Exploration by Self-supervised Prediction + Pathak, Agrawal, Efros, and Darrell - UC Berkeley - ICML 2017. + https://arxiv.org/pdf/1705.05363.pdf + [2] Code implementation reference: + https://github.com/pathak22/noreward-rl + https://github.com/jcwleo/curiosity-driven-exploration-pytorch + + 1) Embedding observations into a latent space + 2) Predicting the action logit given two consecutive embedded observations + 3) Predicting the next embedded obs, given the embeded former observation and action + """ + + def __init__( + self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType, residual_number: int, + inverse_hidden_size: int, action_shape: int + ) -> None: + super(ICMNetwork, self).__init__() + self.action_shape = action_shape + feature_output = hidden_size_list[-1] + self.feature = RepresentationNetwork(obs_shape, hidden_size_list) + self.inverse_net = nn.Sequential( + nn.Linear(feature_output * 2, inverse_hidden_size), nn.ReLU(), nn.Linear(inverse_hidden_size, action_shape) + ) + self.residual_number = residual_number + self.residual = nn.ModuleList( + [ + nn.Sequential( + nn.Linear(action_shape + 512, 512), + nn.LeakyReLU(), + nn.Linear(512, 512), + ) for _ in range(self.residual_number * 2) + ] + ) + self.forward_net_backbone = RepresentationNetwork(action_shape + feature_output, [512], nn.LeakyReLU()) + self.forward_net_head = nn.Linear(action_shape + 512, feature_output) + + def _forward(self, state: torch.Tensor, next_state: torch.Tensor, + action_long: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Overview: + Use observation, next_observation and action to genearte ICM module + Parameter updates with ICMNetwork forward setup. + Arguments: + - state (:obj:`torch.Tensor`): + The current state batch + - next_state (:obj:`torch.Tensor`): + The next state batch + - action_long (:obj:`torch.Tensor`): + The action batch + Returns: + - real_next_state_feature (:obj:`torch.Tensor`): + Run with the encoder. Return the real next_state's embedded feature. + - pred_next_state_feature (:obj:`torch.Tensor`): + Run with the encoder and residual network. Return the predicted next_state's embedded feature. + - pred_action_logit (:obj:`torch.Tensor`): + Run with the encoder. Return the predicted action logit. 
+ Shapes: + - state (:obj:`torch.Tensor`): :math:`(B, N)`, where B is the batch size and N is ''obs_shape'' + - next_state (:obj:`torch.Tensor`): :math:`(B, N)`, where B is the batch size and N is ''obs_shape'' + - action_long (:obj:`torch.Tensor`): :math:`(B)`, where B is the batch size'' + - real_next_state_feature (:obj:`torch.Tensor`): :math:`(B, M)`, where B is the batch size + and M is embedded feature size + - pred_next_state_feature (:obj:`torch.Tensor`): :math:`(B, M)`, where B is the batch size + and M is embedded feature size + - pred_action_logit (:obj:`torch.Tensor`): :math:`(B, A)`, where B is the batch size + and A is the ''action_shape'' + """ + # use feature network to encode state and next state + # feature network will use to identify whether the state has been seen + action = one_hot(action_long, num=self.action_shape) + encode_state = self.feature(state) + encode_next_state = self.feature(next_state) + # get pred action logit + concat_state = torch.cat((encode_state, encode_next_state), 1) + pred_action_logit = self.inverse_net(concat_state) + + # get pred next state + pred_next_state_feature_orig = torch.cat((encode_state, action), 1) + pred_next_state_feature_orig = self.forward_net_backbone(pred_next_state_feature_orig) + + # residual + for i in range(self.residual_number): + pred_next_state_feature = self.residual[i * 2](torch.cat((pred_next_state_feature_orig, action), 1)) + pred_next_state_feature_orig = self.residual[i * 2 + 1]( + torch.cat((pred_next_state_feature, action), 1) + ) + pred_next_state_feature_orig + pred_next_state_feature = self.forward_net_head(torch.cat((pred_next_state_feature_orig, action), 1)) + real_next_state_feature = encode_next_state + return real_next_state_feature, pred_next_state_feature, pred_action_logit + + def learn(self, state: torch.Tensor, next_state: torch.Tensor, + action_long: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, float]: + real_next_state_feature, pred_next_state_feature, pred_action_logit = self._forward( + state, next_state, action_long + ) + + inverse_loss = F.cross_entropy(pred_action_logit, action_long.long()) + forward_loss = F.mse_loss(pred_next_state_feature, real_next_state_feature.detach()).mean() + action = torch.argmax(F.softmax(pred_action_logit), -1) + accuracy = torch.sum(action == action_long.squeeze(-1)).item() / action_long.shape[0] + return inverse_loss, forward_loss, accuracy + + def forward(self, state: torch.Tensor, next_state: torch.Tensor, action_long: torch.Tensor) -> torch.Tensor: + with torch.no_grad(): + real_next_state_feature, pred_next_state_feature, _ = self._forward(state, next_state, action_long) + reward = F.mse_loss(real_next_state_feature, pred_next_state_feature, reduction="none").mean(dim=1) + + return reward + + +class GCLNetwork(nn.Module): + + def __init__( + self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType, output_size: int, + action_shape: int + ) -> None: + super(GCLNetwork, self).__init__() + self.feature = RepresentationNetwork(obs_shape, hidden_size_list) + self.fc = nn.Linear(hidden_size_list[-1], output_size) + self.action_shape = action_shape + + def forward(self, data: torch.Tensor) -> torch.Tensor: + reward = self.feature(data) + reward = self.fc(reward) + + return reward + + def learn(self, expert_demo: torch.Tensor, samp: torch.Tensor) -> torch.Tensor: + cost_demo = self.forward( + torch.cat([expert_demo['obs'], expert_demo['action'].float().reshape(-1, self.action_shape)], dim=-1) + ) + cost_samp = self.forward( + 
torch.cat([samp['obs'], samp['action'].float().reshape(-1, self.action_shape)], dim=-1) + ) + prob = samp['prob'].unsqueeze(-1) + loss_IOC = torch.mean(cost_demo) + \ + torch.log(torch.mean(torch.exp(-cost_samp)/(prob+1e-7))) + + return loss_IOC + + +class TREXNetwork(nn.Module): + + def __init__( + self, + obs_shape: Union[int, SequenceType], + hidden_size_list: SequenceType, + kernel_size: Optional[SequenceType] = None, + stride: Optional[SequenceType] = None, + activation: Optional[nn.Module] = nn.ReLU(), + l1_reg: Optional[float] = 0, + ) -> None: + super(TREXNetwork, self).__init__() + self.input_size = obs_shape + self.l1_reg = l1_reg + self.output_size = hidden_size_list[-1] + hidden_size_list = hidden_size_list[:-1] + self.feature = RepresentationNetwork(obs_shape, hidden_size_list, activation, kernel_size, stride) + self.act = activation + self.fc = nn.Linear(hidden_size_list[-1], self.output_size) + + def forward(self, data: torch.Tensor) -> torch.Tensor: + reward = self.feature(data) + if isinstance(self.input_size, int) is False and len(self.input_size) == 3: + reward = self.act(reward) + reward = self.fc(reward) + return reward + + def learn(self, traj_i: torch.Tensor, traj_j: torch.Tensor, labels: torch.Tensor) -> torch.Tensor: + outputs, total_abs_reward = self.get_outputs_abs_reward(traj_i, traj_j) + outputs = outputs.unsqueeze(0) + loss = F.cross_entropy(outputs, labels) + self.l1_reg * total_abs_reward + return loss + + def get_outputs_abs_reward(self, traj_i: torch.Tensor, traj_j: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + reward_i = self.forward(traj_i) + reward_j = self.forward(traj_j) + + cum_r_i = torch.sum(reward_i) + cum_r_j = torch.sum(reward_j) + outputs = torch.cat((cum_r_i.unsqueeze(0), cum_r_j.unsqueeze(0)), 0) + total_abs_reward = torch.sum(torch.abs(reward_i)) + torch.sum(torch.abs(reward_j)) + + return outputs, total_abs_reward + + +class InverseNetwork(nn.Module): + """ + Overview: + Network used in Episodic reward model for NGU. 
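(Illustrative sketch, not part of this patch: the shared ``RNDNetwork`` defined above pairs a frozen ``target`` encoder with a trainable ``predictor``; ``learn()`` returns the predictor's MSE loss and ``forward()`` the per-sample novelty reward, optionally min-max normalized. Shapes below are assumptions.)

import torch
from ding.reward_model.network import RNDNetwork

net = RNDNetwork(obs_shape=4, hidden_size_list=[128, 64])
obs = torch.rand(32, 4)

loss = net.learn(obs)               # MSE between predictor and frozen target features
novelty = net.forward(obs)          # min-max normalized novelty reward (norm=True by default)
raw = net.forward(obs, norm=False)  # raw reward, as used by the rnd-ngu model below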
+ """ + + def __init__( + self, obs_shape: Union[int, SequenceType], action_shape: int, hidden_size_list: SequenceType, device: str + ) -> None: + super(InverseNetwork, self).__init__() + self._running_mean_std_episodic_dist = RunningMeanStd(epsilon=1e-4) + self.embedding_net = RepresentationNetwork(obs_shape, hidden_size_list) + self.obs_shape = obs_shape + self.device = device + self.inverse_net = nn.Sequential( + nn.Linear(hidden_size_list[-1] * 2, 512), nn.ReLU(inplace=True), nn.Linear(512, action_shape) + ) + + def _compute_intrinsic_reward( + self, + episodic_memory: List, + current_controllable_state: torch.Tensor, + k: int = 10, + kernel_cluster_distance: float = 0.008, + kernel_epsilon: float = 0.0001, + c: float = 0.001, + siminarity_max: int = 8, + ) -> torch.Tensor: + # this function is modified from https://github.com/Coac/never-give-up/blob/main/embedding_model.py + state_dist = torch.cdist(current_controllable_state.unsqueeze(0), episodic_memory, p=2).squeeze(0).sort()[0][:k] + self._running_mean_std_episodic_dist.update(state_dist.cpu().numpy()) + state_dist = state_dist / (self._running_mean_std_episodic_dist.mean + 1e-11) + + state_dist = torch.clamp(state_dist - kernel_cluster_distance, min=0, max=None) + kernel = kernel_epsilon / (state_dist + kernel_epsilon) + s = torch.sqrt(torch.clamp(torch.sum(kernel), min=0, max=None)) + c + + if s > siminarity_max: + print('s > siminarity_max:', s.max(), s.min()) + return torch.tensor(0) # NOTE + return 1 / s + # average value 1/( ( 10* 1e-4/(1+1e-4) )**(1/2)+1e-3 ) = 30 + + def forward(self, obs: list, is_null: list) -> Tuple[torch.Tensor, torch.Tensor]: + batch_size = len(obs) + seq_length = len(obs[0]) + + # stack episode dim + obs = [torch.stack(episode_obs, dim=0) for episode_obs in obs] + + # stack batch dim + # way 0 + if isinstance(self.obs_shape, int): + obs = torch.stack(obs, dim=0).view(batch_size * seq_length, self.obs_shape).to(self.device) + else: # len(self.cfg.obs_shape) == 3 for image obs + obs = torch.stack(obs, dim=0).view(batch_size * seq_length, *self.obs_shape).to(self.device) + # way 2 + # obs = torch.cat(obs, 0) + + with torch.no_grad(): + cur_obs_embedding = self.embedding_net(obs) + cur_obs_embedding = cur_obs_embedding.view(batch_size, seq_length, -1) + episodic_reward = [[] for _ in range(batch_size)] + null_cnt = 0 # the number of null transitions in the whole minibatch + for i in range(batch_size): + for j in range(seq_length): + if j < 10: + episodic_reward[i].append(torch.tensor(0.).to(self.device)) + elif j: + episodic_memory = cur_obs_embedding[i][:j] + reward = self._compute_intrinsic_reward(episodic_memory, + cur_obs_embedding[i][j]).to(self.device) + episodic_reward[i].append(reward) + + if torch.nonzero(torch.tensor(is_null[i]).float()).shape[0] != 0: + # TODO(pu): if have null padding, the episodic_reward should be 0 + null_start_index = int(torch.nonzero(torch.tensor(is_null[i]).float()).squeeze(-1)[0]) + # add the number of null transitions in i'th sequence in batch + null_cnt = null_cnt + seq_length - null_start_index + for k in range(null_start_index, seq_length): + episodic_reward[i][k] = torch.tensor(0).to(self.device) + # episodic_reward[i][null_start_index:-1]=[torch.tensor(0).to(self.device) + # for i in range(seq_length-null_start_index)] + + # list(list(tensor)) -> tensor + tmp = [torch.stack(episodic_reward_tmp, dim=0) for episodic_reward_tmp in episodic_reward] + # stack batch dim + episodic_reward = torch.stack(tmp, dim=0) # TODO(pu): image case + episodic_reward = 
episodic_reward.view(-1) # torch.Size([32, 42]) -> torch.Size([32*42] + + episodic_reward_real_mean = sum(episodic_reward) / ( + batch_size * seq_length - null_cnt + ) # TODO(pu): recompute mean + + return episodic_reward, episodic_reward_real_mean + + def learn(self, inputs: Dict) -> torch.Tensor: + """ + Overview: + Use observation, next_observation and action to train the inverse model + inputs must contain ['obs', 'next_obs', 'action'] + Arguments: + - obs (:obj:`torch.Tensor`): + The current observation + - next_obs (:obj:`torch.Tensor`): + The next observation + - action (:obj:`torch.Tensor`): + The action + """ + cur_obs_embedding = self.embedding_net(inputs['obs']) + next_obs_embedding = self.embedding_net(inputs['next_obs']) + # get pred action + obs_plus_next_obs = torch.cat([cur_obs_embedding, next_obs_embedding], dim=-1) + pred_action_logits = self.inverse_net(obs_plus_next_obs) + loss = F.cross_entropy(pred_action_logits, inputs['action'].squeeze(-1)) + return loss diff --git a/ding/reward_model/ngu_reward_model.py b/ding/reward_model/ngu_reward_model.py index 5a8758bdb7..12971e17d4 100644 --- a/ding/reward_model/ngu_reward_model.py +++ b/ding/reward_model/ngu_reward_model.py @@ -1,18 +1,16 @@ import copy import random -from typing import Union, Tuple, Dict, List import numpy as np import torch -import torch.nn as nn -import torch.nn.functional as F import torch.optim as optim +from tensorboardX import SummaryWriter from easydict import EasyDict -from ding.model import FCEncoder, ConvEncoder from ding.utils import RunningMeanStd -from ding.utils import SequenceType, REWARD_MODEL_REGISTRY +from ding.utils import REWARD_MODEL_REGISTRY from .base_reward_model import BaseRewardModel +from .network import RNDNetwork, InverseNetwork def collect_data_and_exclude_null_data_rnd(data_in): @@ -69,31 +67,6 @@ def collect_data_episodic(data_in): return res, is_null_list -class RndNetwork(nn.Module): - - def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType) -> None: - super(RndNetwork, self).__init__() - if isinstance(obs_shape, int) or len(obs_shape) == 1: - self.target = FCEncoder(obs_shape, hidden_size_list) - self.predictor = FCEncoder(obs_shape, hidden_size_list) - elif len(obs_shape) == 3: - self.target = ConvEncoder(obs_shape, hidden_size_list) - self.predictor = ConvEncoder(obs_shape, hidden_size_list) - else: - raise KeyError( - "not support obs_shape for pre-defined encoder: {}, " - "please customize your own RND model".format(obs_shape) - ) - for param in self.target.parameters(): - param.requires_grad = False - - def forward(self, obs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - predict_feature = self.predictor(obs) - with torch.no_grad(): - target_feature = self.target(obs) - return predict_feature, target_feature - - @REWARD_MODEL_REGISTRY.register('rnd-ngu') class RndNGURewardModel(BaseRewardModel): r""" @@ -116,28 +89,30 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> assert device == "cpu" or device.startswith("cuda") self.device = device self.tb_logger = tb_logger - self.reward_model = RndNetwork(config.obs_shape, config.hidden_size_list) + self.reward_model = RNDNetwork(config.obs_shape, config.hidden_size_list) self.reward_model.to(self.device) self.intrinsic_reward_type = config.intrinsic_reward_type assert self.intrinsic_reward_type in ['add', 'new', 'assign'] self.train_data_total = [] self.train_data = [] - self.opt = optim.Adam(self.reward_model.predictor.parameters(), 
config.learning_rate) + self.train_cnt_icm = 0 self.estimate_cnt_rnd = 0 self._running_mean_std_rnd = RunningMeanStd(epsilon=1e-4) + self.opt = optim.Adam(self.reward_model.predictor.parameters(), config.learning_rate) self.only_use_last_five_frames = config.only_use_last_five_frames_for_icm_rnd - def _train(self) -> None: + def _train(self) -> torch.Tensor: train_data: list = random.sample(list(self.train_data_cur), self.cfg.batch_size) train_data: torch.Tensor = torch.stack(train_data).to(self.device) - predict_feature, target_feature = self.reward_model(train_data) - loss = F.mse_loss(predict_feature, target_feature.detach()) + loss = self.reward_model.learn(train_data) self.opt.zero_grad() loss.backward() self.opt.step() + return loss + def train(self) -> None: if self.only_use_last_five_frames: # self.train_obs shape list(list) [batch_size,seq_length,N @@ -167,7 +142,9 @@ def train(self) -> None: # self.train_data = tmp for _ in range(self.cfg.update_per_collect): - self._train() + loss = self._train() + self.tb_logger.add_scalar('rnd_reward/loss', loss, self.train_cnt_icm) + self.train_cnt_icm += 1 def estimate(self, data: list) -> torch.Tensor: """ @@ -180,8 +157,7 @@ def estimate(self, data: list) -> torch.Tensor: obs = torch.stack(obs).to(self.device) with torch.no_grad(): - predict_feature, target_feature = self.reward_model(obs) - reward = F.mse_loss(predict_feature, target_feature, reduction='none').mean(dim=1) + reward = self.reward_model.forward(obs, norm=False) self._running_mean_std_rnd.update(reward.cpu().numpy()) # transform to mean 1 std 1 reward = 1 + (reward - self._running_mean_std_rnd.mean) / (self._running_mean_std_rnd.std + 1e-11) @@ -209,39 +185,6 @@ def reward_deepcopy(self, train_data): return train_data_reward_deepcopy -class InverseNetwork(nn.Module): - - def __init__(self, obs_shape: Union[int, SequenceType], action_shape, hidden_size_list: SequenceType) -> None: - super(InverseNetwork, self).__init__() - if isinstance(obs_shape, int) or len(obs_shape) == 1: - self.embedding_net = FCEncoder(obs_shape, hidden_size_list) - elif len(obs_shape) == 3: - self.embedding_net = ConvEncoder(obs_shape, hidden_size_list) - else: - raise KeyError( - "not support obs_shape for pre-defined encoder: {}, please customize your own RND model". 
- format(obs_shape) - ) - self.inverse_net = nn.Sequential( - nn.Linear(hidden_size_list[-1] * 2, 512), nn.ReLU(inplace=True), nn.Linear(512, action_shape) - ) - - def forward(self, inputs: Dict, inference: bool = False) -> Dict: - if inference: - with torch.no_grad(): - cur_obs_embedding = self.embedding_net(inputs['obs']) - return cur_obs_embedding - else: - # obs: torch.Tensor, next_obs: torch.Tensor - cur_obs_embedding = self.embedding_net(inputs['obs']) - next_obs_embedding = self.embedding_net(inputs['next_obs']) - # get pred action - obs_plus_next_obs = torch.cat([cur_obs_embedding, next_obs_embedding], dim=-1) - pred_action_logits = self.inverse_net(obs_plus_next_obs) - pred_action_probs = nn.Softmax(dim=-1)(pred_action_logits) - return pred_action_logits, pred_action_probs - - @REWARD_MODEL_REGISTRY.register('episodic') class EpisodicNGURewardModel(BaseRewardModel): r""" @@ -275,7 +218,9 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> assert device == "cpu" or device.startswith("cuda") self.device = device self.tb_logger = tb_logger - self.episodic_reward_model = InverseNetwork(config.obs_shape, config.action_shape, config.hidden_size_list) + self.episodic_reward_model = InverseNetwork( + config.obs_shape, config.action_shape, config.hidden_size_list, self.device + ) self.episodic_reward_model.to(self.device) self.intrinsic_reward_type = config.intrinsic_reward_type assert self.intrinsic_reward_type in ['add', 'new', 'assign'] @@ -283,11 +228,10 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> self.train_action_total = [] self.opt = optim.Adam(self.episodic_reward_model.parameters(), config.learning_rate) self.estimate_cnt_episodic = 0 - self._running_mean_std_episodic_dist = RunningMeanStd(epsilon=1e-4) - self._running_mean_std_episodic_reward = RunningMeanStd(epsilon=1e-4) + self.train_cnt_episodic = 0 self.only_use_last_five_frames = config.only_use_last_five_frames_for_icm_rnd - def _train(self) -> None: + def _train(self) -> torch.Tensor: # sample episode's timestep index train_index = np.random.randint(low=0, high=self.train_obs.shape[0], size=self.cfg.batch_size) @@ -295,14 +239,14 @@ def _train(self) -> None: train_next_obs: torch.Tensor = self.train_next_obs[train_index].to(self.device) train_action: torch.Tensor = self.train_action[train_index].to(self.device) - train_data = {'obs': train_obs, 'next_obs': train_next_obs} - pred_action_logits, pred_action_probs = self.episodic_reward_model(train_data) - - inverse_loss = F.cross_entropy(pred_action_logits, train_action.squeeze(-1)) + train_data = {'obs': train_obs, 'next_obs': train_next_obs, 'action': train_action} + inverse_loss = self.episodic_reward_model.learn(train_data) self.opt.zero_grad() inverse_loss.backward() self.opt.step() + return inverse_loss + def train(self) -> None: self.train_next_obs_total = copy.deepcopy(self.train_obs_total) @@ -332,32 +276,9 @@ def train(self) -> None: self.train_action = torch.cat(self.train_action, 0) for _ in range(self.cfg.update_per_collect): - self._train() - - def _compute_intrinsic_reward( - self, - episodic_memory: List, - current_controllable_state: torch.Tensor, - k=10, - kernel_cluster_distance=0.008, - kernel_epsilon=0.0001, - c=0.001, - siminarity_max=8, - ) -> torch.Tensor: - # this function is modified from https://github.com/Coac/never-give-up/blob/main/embedding_model.py - state_dist = torch.cdist(current_controllable_state.unsqueeze(0), episodic_memory, p=2).squeeze(0).sort()[0][:k] - 
self._running_mean_std_episodic_dist.update(state_dist.cpu().numpy()) - state_dist = state_dist / (self._running_mean_std_episodic_dist.mean + 1e-11) - - state_dist = torch.clamp(state_dist - kernel_cluster_distance, min=0, max=None) - kernel = kernel_epsilon / (state_dist + kernel_epsilon) - s = torch.sqrt(torch.clamp(torch.sum(kernel), min=0, max=None)) + c - - if s > siminarity_max: - print('s > siminarity_max:', s.max(), s.min()) - return torch.tensor(0) # NOTE - return 1 / s - # average value 1/( ( 10* 1e-4/(1+1e-4) )**(1/2)+1e-3 ) = 30 + loss = self._train() + self.tb_logger.add_scalar('episodic_reward/train_loss', loss, self.train_cnt_episodic) + self.train_cnt_episodic += 1 def estimate(self, data: list) -> torch.Tensor: """ @@ -365,63 +286,10 @@ def estimate(self, data: list) -> torch.Tensor: """ obs, is_null = collect_data_episodic(data) - # obs shape list(list()) [batch_size,seq_length,obs_dim] - batch_size = len(obs) - seq_length = len(obs[0]) - - # stack episode dim - obs = [torch.stack(episode_obs, dim=0) for episode_obs in obs] - # stack batch dim - # way 0 - if isinstance(self.cfg.obs_shape, int): - obs = torch.stack(obs, dim=0).view(batch_size * seq_length, self.cfg.obs_shape).to(self.device) - else: # len(self.cfg.obs_shape) == 3 for image obs - obs = torch.stack(obs, dim=0).view(batch_size * seq_length, *self.cfg.obs_shape).to(self.device) - # way 2 - # obs = torch.cat(obs, 0) - - inputs = {'obs': obs, 'is_null': is_null} with torch.no_grad(): - cur_obs_embedding = self.episodic_reward_model(inputs, inference=True) - cur_obs_embedding = cur_obs_embedding.view(batch_size, seq_length, -1) - episodic_reward = [[] for _ in range(batch_size)] - null_cnt = 0 # the number of null transitions in the whole minibatch - for i in range(batch_size): - for j in range(seq_length): - if j < 10: - # if self._running_mean_std_episodic_reward.mean is not None: - # episodic_reward[i].append(torch.tensor(self._running_mean_std_episodic_reward.mean).to(self.device)) - # else: - episodic_reward[i].append(torch.tensor(0.).to(self.device)) - elif j: - episodic_memory = cur_obs_embedding[i][:j] - reward = self._compute_intrinsic_reward(episodic_memory, - cur_obs_embedding[i][j]).to(self.device) - episodic_reward[i].append(reward) - - if torch.nonzero(torch.tensor(is_null[i]).float()).shape[0] != 0: - # TODO(pu): if have null padding, the episodic_reward should be 0 - not_null_index = torch.nonzero(torch.tensor(is_null[i]).float()).squeeze(-1) - null_start_index = int(torch.nonzero(torch.tensor(is_null[i]).float()).squeeze(-1)[0]) - # add the number of null transitions in i'th sequence in batch - null_cnt = null_cnt + seq_length - null_start_index - for k in range(null_start_index, seq_length): - episodic_reward[i][k] = torch.tensor(0).to(self.device) - # episodic_reward[i][null_start_index:-1]=[torch.tensor(0).to(self.device) - # for i in range(seq_length-null_start_index)] - - # list(list(tensor)) -> tensor - tmp = [torch.stack(episodic_reward_tmp, dim=0) for episodic_reward_tmp in episodic_reward] - # stack batch dim - episodic_reward = torch.stack(tmp, dim=0) # TODO(pu): image case - episodic_reward = episodic_reward.view(-1) # torch.Size([32, 42]) -> torch.Size([32*42] - - episodic_reward_real_mean = sum(episodic_reward) / ( - batch_size * seq_length - null_cnt - ) # TODO(pu): recompute mean + episodic_reward, episodic_reward_real_mean = self.episodic_reward_model.forward(obs, is_null) self.estimate_cnt_episodic += 1 - 
self._running_mean_std_episodic_reward.update(episodic_reward.cpu().numpy()) self.tb_logger.add_scalar( 'episodic_reward/episodic_reward_max', episodic_reward.max(), self.estimate_cnt_episodic @@ -541,3 +409,157 @@ def fusion_reward( int(data[i]['beta'][j])] return data, estimate_cnt + + +@REWARD_MODEL_REGISTRY.register('ngu-reward') +class NGURewardModel(BaseRewardModel): + """ + Overview: + The unifying reward for ngu which combined rnd-ngu and episodic + The corresponding paper is `never give up: learning directed exploration strategies`. + Interface: + ``__init__``, ``train``, ``estimate``, ``collect_data``, ``clear_data`` + Config: + == ==================== ======== ============= ==================================== ======================= + ID Symbol Type Default Value Description Other(Shape) + == ==================== ======== ============= ==================================== ======================= + 1 ``type`` str ngu-reward | Reward model register name, | + | refer to registry | + | ``REWARD_MODEL_REGISTRY`` | + 2 | ``intrinsic_`` str add | the intrinsic reward type | including add, new + | ``reward_type`` | | , or assign + 3 | ``policy_nstep`` int 1 | the nstep of policy | + 4 | ``nstep`` int 1 | the nstep of reward | + 5 | ``learning_rate`` float 0.001 | The step size of gradient descent | + 6 | ``obs_shape`` Tuple( 4 | the observation shape | + [int, + list]) + 7 | ``action_shape`` int 2 | the action space shape | + 8 | ``batch_size`` int 64 | Training batch size | + 9 | ``hidden`` list [64, 64, | Sequence of ``hidden_size`` | + | ``_size_list`` (int) 128] | of reward network. | + 10 | ``update_per_`` int 100 | Number of updates per collect | + | ``collect`` | | + 11 | ``only_use_`` float 1 | Whether to only use last | + | ``last_five_`` | five frames for ICM/RND. | + | ``frames_for_`` | | + | ``icm_rnd`` | | + 12 | ``last_nonzero_`` bool False | Whether to rescale | used in episode rm + ``reward_rescale`` | last nonzero reward. + 13 | ``last_nonzero_`` int 1 | The weight of last nonzero reward. | used in episode rm + ``reward_weight`` | | + 14 | ``clear_buffer`` int 1 | clear buffer per fixed iters | make sure replay + ``_per_iters`` | buffer's data count + | isn't too few. + | (code work in entry) + == ==================== ======== ============= ==================================== ======================= + """ + config = dict( + # (str) Reward model register name, refer to registry ``REWARD_MODEL_REGISTRY``. + type='ngu-reward', + # (int) nstep for rl algorithms used in episode reward model + policy_nstep=5, + # (int) number of envs for collecting data + collect_env_num=8, + rnd_reward_model=dict( + # (str) The intrinsic reward type, including add, new, or assign. + intrinsic_reward_type='add', + # (float) The step size of gradient descent. + learning_rate=5e-4, + # (Tuple[int, list]) The observation shape. + obs_shape=4, + # (int) The action space shape. + action_shape=2, + # (int) Training batch size. + batch_size=128, # transitions + # (int) Number of updates per collect. + update_per_collect=10, + # (bool) Whether to only use last five frames for ICM/RND. + only_use_last_five_frames_for_icm_rnd=False, + # (int) Clear buffer per fixed iters, make sure replay buffer's data count isn't too few. + clear_buffer_per_iters=10, + # (int) nstep for rl algorithms used in rnd reward model + nstep=5, + # (list(int)) Sequence of ``hidden_size`` of reward network. + # If obs.shape == 1, use MLP layers. + # If obs.shape == 3, use conv layer and final dense layer. 
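# Illustrative sketch (not part of the patch): the kernel-based episodic novelty that the removed
# `_compute_intrinsic_reward` helper implemented, and which the refactored
# `episodic_reward_model.forward(obs, is_null)` call is assumed to compute internally. Constants and
# the running-mean normalization mirror the deleted code; the function name `episodic_novelty` and
# the `dist_running_mean` argument are hypothetical stand-ins for the model's internal state.
import torch


def episodic_novelty(
        episodic_memory: torch.Tensor,  # (N, embed_dim) embeddings of previous controllable states
        current_state: torch.Tensor,  # (embed_dim, ) embedding of the current controllable state
        dist_running_mean: float,  # running mean of k-NN distances, updated elsewhere
        k: int = 10,
        kernel_cluster_distance: float = 0.008,
        kernel_epsilon: float = 0.0001,
        c: float = 0.001,
        similarity_max: float = 8.,
) -> torch.Tensor:
    # distances from the current embedding to its k nearest neighbours in the episodic memory
    state_dist = torch.cdist(current_state.unsqueeze(0), episodic_memory, p=2).squeeze(0).sort()[0][:k]
    # normalize by the running mean distance so the kernel is scale-free
    state_dist = state_dist / (dist_running_mean + 1e-11)
    state_dist = torch.clamp(state_dist - kernel_cluster_distance, min=0)
    # inverse-kernel pseudo-count: many similar states -> large kernel sum -> small reward
    kernel = kernel_epsilon / (state_dist + kernel_epsilon)
    s = torch.sqrt(torch.clamp(torch.sum(kernel), min=0)) + c
    if s > similarity_max:
        return torch.tensor(0.)
    return 1 / s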
+ hidden_size_list=[128, 128, 64], + # (str) The reward model type, which must be rnd-ngu + type='rnd-ngu', + ), + episodic_reward_model=dict( + # (bool) Whether to rescale last nonzero reward. + last_nonzero_reward_rescale=False, + # (int) The weight of last nonzero reward. + last_nonzero_reward_weight=1, + # (str) The intrinsic reward type, including add, new, or assign. + intrinsic_reward_type='add', + # (float) The step size of gradient descent. + learning_rate=5e-4, + # (Tuple[int, list]) The observation shape. + obs_shape=4, + # (int) The action space shape. + action_shape=2, + # (int) Training batch size. + batch_size=128, # transitions + # (int) Number of updates per collect. + update_per_collect=10, + # (bool) Whether to only use last five frames for ICM/RND. + only_use_last_five_frames_for_icm_rnd=False, + # (int) Clear buffer per fixed iters, make sure replay buffer's data count isn't too few. + clear_buffer_per_iters=10, + # (int) nstep for rl algorithms used in episode reward model + nstep=5, + # (list(int)) Sequence of ``hidden_size`` of reward network. + hidden_size_list=[128, 128, 64], + # (str) The reward model type, which must be episodic + type='episodic', + ), + ) + + def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> None: + super(NGURewardModel, self).__init__() + self.cfg = config + self.tb_logger = tb_logger + self.estimate_cnt = 0 + self.rnd_reward_model = RndNGURewardModel(config.rnd_reward_model, device, tb_logger) + self.episodic_reward_model = EpisodicNGURewardModel(config.episodic_reward_model, device, tb_logger) + + def train(self) -> None: + self.rnd_reward_model.train() + self.episodic_reward_model.train() + + def estimate(self, data: list) -> dict: + + # estimate reward + rnd_reward = self.rnd_reward_model.estimate(data) + episodic_reward = self.episodic_reward_model.estimate(data) + + # combine reward + train_data_augumented, self.estimate_cnt = self.episodic_reward_model.fusion_reward( + data, + episodic_reward, + rnd_reward, + nstep=self.cfg.policy_nstep, + collector_env_num=self.cfg.collect_env_num, + tb_logger=self.tb_logger, + estimate_cnt=self.estimate_cnt + ) + + return train_data_augumented + + def collect_data(self, data) -> None: + self.rnd_reward_model.collect_data(data) + self.episodic_reward_model.collect_data(data) + + def clear_data(self, iter: int) -> None: + assert hasattr( + self.cfg.rnd_reward_model, 'clear_buffer_per_iters' + ), "RND Reward Model does not have clear_buffer_per_iters, Clear failed" + assert hasattr( + self.cfg.episodic_reward_model, 'clear_buffer_per_iters' + ), "Episodic Reward Model does not have clear_buffer_per_iters, Clear failed" + if iter % self.cfg.rnd_reward_model.clear_buffer_per_iters == 0: + self.rnd_reward_model.clear_data() + if iter % self.cfg.episodic_reward_model.clear_buffer_per_iters == 0: + self.episodic_reward_model.clear_data() diff --git a/ding/reward_model/pdeil_irl_model.py b/ding/reward_model/pdeil_irl_model.py index b09416f5c2..9cf37d84b8 100644 --- a/ding/reward_model/pdeil_irl_model.py +++ b/ding/reward_model/pdeil_irl_model.py @@ -172,7 +172,7 @@ def estimate(self, data: list) -> List[Dict]: s = torch.stack([item['obs'] for item in train_data_augmented], dim=0) a = torch.stack([item['action'] for item in train_data_augmented], dim=0) if self.p_u_s is None: - print("you need to train you reward model first") + logging.warning("you need to train you reward model first") for item in train_data_augmented: item['reward'].zero_() else: @@ -218,10 +218,14 @@ def 
collect_data(self, item: list): """ self.train_data.extend(item) - def clear_data(self): + def clear_data(self, iter: int): """ Overview: Clearing training data. \ This is a side effect function which clears the data attribute in ``self`` """ - self.train_data.clear() + assert hasattr( + self.cfg, 'clear_buffer_per_iters' + ), "Reward Model does not have clear_buffer_per_iters, Clear failed" + if iter % self.cfg.clear_buffer_per_iters == 0: + self.train_data.clear() diff --git a/ding/reward_model/pwil_irl_model.py b/ding/reward_model/pwil_irl_model.py index 5fec46b821..5ee00eee5b 100644 --- a/ding/reward_model/pwil_irl_model.py +++ b/ding/reward_model/pwil_irl_model.py @@ -1,4 +1,5 @@ from typing import Dict, List +from ditk import logging import math import random import pickle @@ -6,6 +7,7 @@ from ding.utils import REWARD_MODEL_REGISTRY from .base_reward_model import BaseRewardModel +from .reward_model_utils import concat_state_action_pairs def collect_state_action_pairs(iterator): @@ -16,13 +18,12 @@ def collect_state_action_pairs(iterator): Arguments: - iterator (:obj:`Iterable`): Iterables with at least ``obs`` and ``action`` tensor keys. Returns: - - res (:obj:`Torch.tensor`): State and action pairs. + - res (:obj:`List(Tuple(torch.Tensor, torch.Tensor))`): State and action pairs. """ res = [] for item in iterator: state = item['obs'] action = item['action'] - # s_a = torch.cat([state, action.float()], dim=-1) res.append((state, action)) return res @@ -95,6 +96,7 @@ def __init__(self, config: Dict, device: str, tb_logger: 'SummaryWriter') -> Non self.cfg: Dict = config assert device in ["cpu", "cuda"] or "cuda" in device self.device = device + self.tb_logger = tb_logger self.expert_data: List[tuple] = [] self.train_data: List[tuple] = [] # In this algo, model is a dict @@ -114,12 +116,12 @@ def load_expert_data(self) -> None: """ with open(self.cfg.expert_data_path, 'rb') as f: self.expert_data = pickle.load(f) - print("the data size is:", len(self.expert_data)) + logging.info("the data size is: %d", len(self.expert_data)) sample_size = min(self.cfg.sample_size, len(self.expert_data)) self.expert_data = random.sample(self.expert_data, sample_size) self.expert_data = [(item['obs'], item['action']) for item in self.expert_data] self.expert_s, self.expert_a = list(zip(*self.expert_data)) - print('the expert data demonstrations is:', len(self.expert_data)) + logging.info('the expert data demonstrations is: %d', len(self.expert_data)) def collect_data(self, data: list) -> None: """ @@ -253,10 +255,14 @@ def _train(self, data: list): reward = self.cfg.alpha * math.exp(self.reward_factor * c) self.reward_table[(s, a)] = torch.FloatTensor([reward]) - def clear_data(self) -> None: + def clear_data(self, iter: int) -> None: """ Overview: Clearing training data. 
\ This is a side effect function which clears the data attribute in ``self`` """ - self.train_data.clear() + assert hasattr( + self.cfg, 'clear_buffer_per_iters' + ), "Reward Model does not have clear_buffer_per_iters, Clear failed" + if iter % self.cfg.clear_buffer_per_iters == 0: + self.train_data.clear() diff --git a/ding/reward_model/red_irl_model.py b/ding/reward_model/red_irl_model.py index a7daeeceec..91afbcf145 100644 --- a/ding/reward_model/red_irl_model.py +++ b/ding/reward_model/red_irl_model.py @@ -1,29 +1,13 @@ from typing import Dict, List import pickle import random -import torch -import torch.nn as nn + import torch.optim as optim from ding.utils import REWARD_MODEL_REGISTRY, one_time_warning from .base_reward_model import BaseRewardModel - - -class SENet(nn.Module): - """support estimation network""" - - def __init__(self, input_size: int, hidden_size: int, output_dims: int) -> None: - super(SENet, self).__init__() - self.l_1 = nn.Linear(input_size, hidden_size) - self.l_2 = nn.Linear(hidden_size, output_dims) - self.act = nn.Tanh() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - out = self.l_1(x) - out = self.act(out) - out = self.l_2(out) - out = self.act(out) - return out +from .network import REDNetwork +from .reward_model_utils import concat_state_action_pairs @REWARD_MODEL_REGISTRY.register('red') @@ -35,38 +19,39 @@ class RedRewardModel(BaseRewardModel): ``estimate``, ``train``, ``load_expert_data``, ``collect_data``, ``clear_date``, \ ``__init__``, ``_train`` Config: - == ================== ===== ============= ======================================= ======================= - ID Symbol Type Default Value Description Other(Shape) - == ================== ===== ============= ======================================= ======================= - 1 ``type`` str red | Reward model register name, refer | - | to registry ``REWARD_MODEL_REGISTRY`` | - 2 | ``expert_data_`` str expert_data | Path to the expert dataset | Should be a '.pkl' - | ``path`` .pkl | | file - 3 | ``sample_size`` int 1000 | sample data from expert dataset | - | with fixed size | - 4 | ``sigma`` int 5 | hyperparameter of r(s,a) | r(s,a) = exp( + == ================== ====== ============= ======================================= ======================= + ID Symbol Type Default Value Description Other(Shape) + == ================== ====== ============= ======================================= ======================= + 1 ``type`` str red | Reward model register name, refer | + | to registry ``REWARD_MODEL_REGISTRY`` | + 2 | ``expert_data_`` str expert_data | Path to the expert dataset | Should be a '.pkl' + | ``path`` .pkl | | file + 3 | ``sample_size`` int 1000 | sample data from expert dataset | + | with fixed size | + 4 | ``sigma`` int 5 | hyperparameter of r(s,a) | r(s,a) = exp( | -sigma* L(s,a)) - 5 | ``batch_size`` int 64 | Training batch size | - 6 | ``hidden_size`` int 128 | Linear model hidden size | - 7 | ``update_per_`` int 100 | Number of updates per collect | - | ``collect`` | | - 8 | ``clear_buffer`` int 1 | clear buffer per fixed iters | make sure replay + 5 | ``batch_size`` int 64 | Training batch size | + 6 | ``hidden`` list [64, 64, | Sequence of ``hidden_size`` | + | ``_size_list`` (int) 128] | of reward network | + 7 | ``update_per_`` int 100 | Number of updates per collect | + | ``collect`` | | + 8 | ``clear_buffer`` int 1 | clear buffer per fixed iters | make sure replay ``_per_iters`` | buffer's data count | isn't too few. 
| (code work in entry) - == ================== ===== ============= ======================================= ======================= - Properties: - - online_net (:obj: `SENet`): The reward model, in default initialized once as the training begins. + == ================== ====== ============= ======================================= ======================= """ config = dict( # (str) Reward model register name, refer to registry ``REWARD_MODEL_REGISTRY``. type='red', - # (int) Linear model input size. - # input_size=4, + # (int) observation shape + # obs_shape=4, + # (int) action shape + # action_shape=1, # (int) Sample data from expert dataset with fixed size. sample_size=1000, - # (int) Linear model hidden size. - hidden_size=128, + # (list(int)) Sequence of ``hidden_size`` of reward network. + hidden_size_list=[128, 1], # (float) The step size of gradient descent. learning_rate=1e-3, # (int) How many updates(iterations) to train after collector's one collection. @@ -99,11 +84,9 @@ def __init__(self, config: Dict, device: str, tb_logger: 'SummaryWriter') -> Non self.device = device assert device in ["cpu", "cuda"] or "cuda" in device self.tb_logger = tb_logger - self.target_net: SENet = SENet(config.input_size, config.hidden_size, 1) - self.online_net: SENet = SENet(config.input_size, config.hidden_size, 1) - self.target_net.to(device) - self.online_net.to(device) - self.opt: optim.Adam = optim.Adam(self.online_net.parameters(), config.learning_rate) + self.reward_model = REDNetwork(config.obs_shape, config.action_shape, config.hidden_size_list, config.sigma) + self.reward_model.to(self.device) + self.opt = optim.Adam(self.reward_model.predictor.parameters(), config.learning_rate) self.train_once_flag = False self.load_expert_data() @@ -121,19 +104,18 @@ def load_expert_data(self) -> None: self.expert_data = random.sample(self.expert_data, sample_size) print('the expert data size is:', len(self.expert_data)) - def _train(self, batch_data: torch.Tensor) -> float: + def _train(self) -> float: """ Overview: Helper function for ``train`` which caclulates loss for train data and expert data. - Arguments: - - batch_data (:obj:`torch.Tensor`): Data used for training Returns: - - Combined loss calculated of reward model from using ``batch_data`` in both target and reward models. + - Combined loss calculated of reward model from using ``states_actions_tensor``. 
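# Illustrative sketch (not part of the patch): what the new `REDNetwork.learn` / `forward` calls are
# assumed to do, reconstructed from the deleted SENet-based code in this file and the
# r(s, a) = exp(-sigma * L(s, a)) formula in the class docstring. The class name below is a
# hypothetical stand-in; only the structure (frozen target, trainable predictor) is taken from the
# patch.
import torch
import torch.nn as nn
import torch.nn.functional as F


class REDNetworkSketch(nn.Module):

    def __init__(self, input_size: int, hidden_size: int, sigma: float) -> None:
        super().__init__()
        # fixed, randomly initialized target and a trainable predictor, RND-style
        self.target = nn.Sequential(nn.Linear(input_size, hidden_size), nn.Tanh(), nn.Linear(hidden_size, 1))
        self.predictor = nn.Sequential(nn.Linear(input_size, hidden_size), nn.Tanh(), nn.Linear(hidden_size, 1))
        for param in self.target.parameters():
            param.requires_grad = False
        self.sigma = sigma

    def learn(self, states_actions: torch.Tensor) -> torch.Tensor:
        # regress the predictor onto the frozen target on expert (s, a) pairs
        with torch.no_grad():
            target_feature = self.target(states_actions)
        predict_feature = self.predictor(states_actions)
        return F.mse_loss(predict_feature, target_feature)

    def forward(self, states_actions: torch.Tensor) -> torch.Tensor:
        # reward is high where the predictor matches the target, i.e. near the expert support
        with torch.no_grad():
            predict_feature = self.predictor(states_actions)
            target_feature = self.target(states_actions)
        mismatch = ((predict_feature - target_feature) ** 2).mean(dim=1)
        return torch.exp(-self.sigma * mismatch)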
""" - with torch.no_grad(): - target = self.target_net(batch_data) - hat: torch.Tensor = self.online_net(batch_data) - loss: torch.Tensor = ((hat - target) ** 2).mean() + sample_batch = random.sample(self.expert_data, self.cfg.batch_size) + states_actions_tensor = concat_state_action_pairs(sample_batch) + states_actions_tensor = states_actions_tensor.to(self.device) + loss = self.reward_model.learn(states_actions_tensor) + # loss = F.mse_loss(predict_feature, target_feature.detach()) self.opt.zero_grad() loss.backward() self.opt.step() @@ -150,17 +132,7 @@ def train(self) -> None: one_time_warning('RED model should be trained once, we do not train it anymore') else: for i in range(self.cfg.update_per_collect): - sample_batch = random.sample(self.expert_data, self.cfg.batch_size) - states_data = [] - actions_data = [] - for item in sample_batch: - states_data.append(item['obs']) - actions_data.append(item['action']) - states_tensor: torch.Tensor = torch.stack(states_data).float() - actions_tensor: torch.Tensor = torch.stack(actions_data).float() - states_actions_tensor: torch.Tensor = torch.cat([states_tensor, actions_tensor], dim=1) - states_actions_tensor = states_actions_tensor.to(self.device) - loss = self._train(states_actions_tensor) + loss = self._train() self.tb_logger.add_scalar('reward_model/red_loss', loss, i) self.train_once_flag = True @@ -177,20 +149,9 @@ def estimate(self, data: list) -> List[Dict]: # NOTE: deepcopy reward part of data is very important, # otherwise the reward of data in the replay buffer will be incorrectly modified. train_data_augmented = self.reward_deepcopy(data) - states_data = [] - actions_data = [] - for item in train_data_augmented: - states_data.append(item['obs']) - actions_data.append(item['action']) - states_tensor = torch.stack(states_data).float() - actions_tensor = torch.stack(actions_data).float() - states_actions_tensor = torch.cat([states_tensor, actions_tensor], dim=1) + states_actions_tensor = concat_state_action_pairs(train_data_augmented) states_actions_tensor = states_actions_tensor.to(self.device) - with torch.no_grad(): - hat_1 = self.online_net(states_actions_tensor) - hat_2 = self.target_net(states_actions_tensor) - c = ((hat_1 - hat_2) ** 2).mean(dim=1) - r = torch.exp(-self.cfg.sigma * c) + r = self.reward_model.forward(states_actions_tensor) for item, rew in zip(train_data_augmented, r): item['reward'] = rew return train_data_augmented @@ -204,7 +165,7 @@ def collect_data(self, data) -> None: # if online_net is trained continuously, there should be some implementations in collect_data method pass - def clear_data(self): + def clear_data(self, iter: int): """ Overview: Collecting clearing data, not implemented if reward model (i.e. online_net) is only trained ones, \ diff --git a/ding/reward_model/reward_model_utils.py b/ding/reward_model/reward_model_utils.py new file mode 100644 index 0000000000..91cdd85cfa --- /dev/null +++ b/ding/reward_model/reward_model_utils.py @@ -0,0 +1,106 @@ +from typing import Optional, List, Any +from collections.abc import Iterable +from easydict import EasyDict + +import torch + +from ding.torch_utils import one_hot +from ding.utils import RunningMeanStd +from ding.torch_utils.data_helper import to_tensor + + +def concat_state_action_pairs( + data: list, action_size: Optional[int] = None, one_hot_: Optional[bool] = False +) -> torch.Tensor: + """ + Overview: + Concatenate state and action pairs from input. + Arguments: + - data (:obj:`List`): List with at least ``obs`` and ``action`` keys. 
+    Returns:
+        - state_actions_tensor (:obj:`torch.Tensor`): Concatenated state-action pairs.
+    """
+    states_data = []
+    actions_data = []
+    # check that each data item (dict) contains the keys 'obs' and 'action'
+    assert isinstance(data, Iterable), "data should be Iterable"
+    assert "obs" in data[0] and "action" in data[0], "each data item must contain the keys 'obs' and 'action'"
+    for item in data:
+        states_data.append(item['obs'].flatten())
+        if one_hot_ and action_size:
+            action = one_hot(torch.Tensor(item['action']).long(), action_size).squeeze(dim=0)
+            actions_data.append(action)
+        else:
+            actions_data.append(item['action'])
+
+    states_tensor: torch.Tensor = torch.stack(states_data).float()
+    actions_tensor: torch.Tensor = torch.stack(actions_data).float()
+    states_actions_tensor: torch.Tensor = torch.cat([states_tensor, actions_tensor], dim=1)
+
+    return states_actions_tensor
+
+
+def combine_intrinsic_exterinsic_reward(
+        train_data_augmented: Any, intrinsic_reward: List[torch.Tensor], config: EasyDict
+) -> Any:
+    """
+    Overview:
+        Combine intrinsic and extrinsic rewards according to ``intrinsic_reward_type``.
+    Arguments:
+        - train_data_augmented (:obj:`List`): List of transitions, each with at least a ``reward`` key.
+        - intrinsic_reward (:obj:`List`): List in which each item is an intrinsic reward tensor.
+        - config (:obj:`EasyDict`): Reward model config, which must include ``intrinsic_reward_type``.
+    Returns:
+        - train_data_augmented (:obj:`List`): List of transitions with combined rewards.
+    """
+    for item, in_rew in zip(train_data_augmented, intrinsic_reward):
+        if config.intrinsic_reward_type == 'add':
+            if config.extrinsic_reward_norm:
+                item['reward'
+                     ] = item['reward'] / config.extrinsic_reward_norm_max + in_rew * config.intrinsic_reward_weight
+            else:
+                item['reward'] = item['reward'] + in_rew * config.intrinsic_reward_weight
+        elif config.intrinsic_reward_type == 'new':
+            item['intrinsic_reward'] = in_rew
+            if config.extrinsic_reward_norm:
+                item['reward'] = item['reward'] / config.extrinsic_reward_norm_max
+        elif config.intrinsic_reward_type == 'assign':
+            item['reward'] = in_rew
+    return train_data_augmented
+
+
+def collect_states(iterator) -> List:
+    """
+    Overview:
+        Collect states from a list of transition dicts.
+    Arguments:
+        - iterator (:obj:`List`): List of dicts with at least an ``obs`` key.
+    Returns:
+        - res (:obj:`List`): List of obs.
+    """
+    res = []
+    for item in iterator:
+        state = item['obs']
+        res.append(state)
+    return res
+
+
+def obs_norm(
+        train_data: torch.Tensor, running_mean_std_obs: RunningMeanStd, config: EasyDict, device: str
+) -> torch.Tensor:
+    """
+    Overview:
+        Normalize obs to zero mean and unit std, then move the normalized obs to the specified device.
+    Arguments:
+        - train_data (:obj:`Tensor`): Tensor of obs.
+        - running_mean_std_obs (:obj:`RunningMeanStd`): Running mean/std tracker for obs.
+        - config (:obj:`EasyDict`): Reward model config, which must include ``obs_norm_clamp_max`` and ``obs_norm_clamp_min``.
+    Returns:
+        - train_data (:obj:`Tensor`): Tensor of normalized obs.
+    """
+    running_mean_std_obs.update(train_data.cpu().numpy())
+    train_data = (train_data - to_tensor(running_mean_std_obs.mean).to(device)) / to_tensor(running_mean_std_obs.std
+                                                                                            ).to(device)
+    train_data = torch.clamp(train_data, min=config.obs_norm_clamp_min, max=config.obs_norm_clamp_max)
+
+    return train_data
diff --git a/ding/reward_model/rnd_reward_model.py b/ding/reward_model/rnd_reward_model.py
index 00bb1542fd..ca153492ae 100644
--- a/ding/reward_model/rnd_reward_model.py
+++ b/ding/reward_model/rnd_reward_model.py
@@ -1,53 +1,18 @@
-from typing import Union, Tuple, List, Dict
+from typing import List, Dict
 from easydict import EasyDict
 import random
 
 import torch
-import torch.nn as nn
 import torch.optim as optim
-import torch.nn.functional as F
 
-from ding.utils import SequenceType, REWARD_MODEL_REGISTRY
-from ding.model import FCEncoder, ConvEncoder
+from ding.utils import REWARD_MODEL_REGISTRY
 from .base_reward_model import BaseRewardModel
+from .reward_model_utils import combine_intrinsic_exterinsic_reward, collect_states, obs_norm
+from .network import RNDNetwork
 from ding.utils import RunningMeanStd
-from ding.torch_utils.data_helper import to_tensor
 
 import numpy as np
 
 
-def collect_states(iterator):
-    res = []
-    for item in iterator:
-        state = item['obs']
-        res.append(state)
-    return res
-
-
-class RndNetwork(nn.Module):
-
-    def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType) -> None:
-        super(RndNetwork, self).__init__()
-        if isinstance(obs_shape, int) or len(obs_shape) == 1:
-            self.target = FCEncoder(obs_shape, hidden_size_list)
-            self.predictor = FCEncoder(obs_shape, hidden_size_list)
-        elif len(obs_shape) == 3:
-            self.target = ConvEncoder(obs_shape, hidden_size_list)
-            self.predictor = ConvEncoder(obs_shape, hidden_size_list)
-        else:
-            raise KeyError(
-                "not support obs_shape for pre-defined encoder: {}, please customize your own RND model".
- format(obs_shape) - ) - for param in self.target.parameters(): - param.requires_grad = False - - def forward(self, obs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - predict_feature = self.predictor(obs) - with torch.no_grad(): - target_feature = self.target(obs) - return predict_feature, target_feature - - @REWARD_MODEL_REGISTRY.register('rnd') class RndRewardModel(BaseRewardModel): """ @@ -126,38 +91,32 @@ def __init__(self, config: EasyDict, device: str = 'cpu', tb_logger: 'SummaryWri from tensorboardX import SummaryWriter tb_logger = SummaryWriter('rnd_reward_model') self.tb_logger = tb_logger - self.reward_model = RndNetwork(config.obs_shape, config.hidden_size_list) + self.reward_model = RNDNetwork(config.obs_shape, config.hidden_size_list) self.reward_model.to(self.device) self.intrinsic_reward_type = config.intrinsic_reward_type assert self.intrinsic_reward_type in ['add', 'new', 'assign'] self.train_obs = [] self.opt = optim.Adam(self.reward_model.predictor.parameters(), config.learning_rate) - self._running_mean_std_rnd_reward = RunningMeanStd(epsilon=1e-4) self.estimate_cnt_rnd = 0 self.train_cnt_icm = 0 self._running_mean_std_rnd_obs = RunningMeanStd(epsilon=1e-4) - def _train(self) -> None: + def _train(self) -> torch.Tensor: train_data: list = random.sample(self.train_obs, self.cfg.batch_size) train_data: torch.Tensor = torch.stack(train_data).to(self.device) if self.cfg.obs_norm: - # Note: observation normalization: transform obs to mean 0, std 1 - self._running_mean_std_rnd_obs.update(train_data.cpu().numpy()) - train_data = (train_data - to_tensor(self._running_mean_std_rnd_obs.mean).to(self.device)) / to_tensor( - self._running_mean_std_rnd_obs.std - ).to(self.device) - train_data = torch.clamp(train_data, min=self.cfg.obs_norm_clamp_min, max=self.cfg.obs_norm_clamp_max) - - predict_feature, target_feature = self.reward_model(train_data) - loss = F.mse_loss(predict_feature, target_feature.detach()) - self.tb_logger.add_scalar('rnd_reward/loss', loss, self.train_cnt_icm) + train_data = obs_norm(train_data, self._running_mean_std_rnd_obs, self.cfg, self.device) + loss = self.reward_model.learn(train_data) self.opt.zero_grad() loss.backward() self.opt.step() + return loss + def train(self) -> None: for _ in range(self.cfg.update_per_collect): - self._train() + loss = self._train() + self.tb_logger.add_scalar('rnd_reward/loss', loss, self.train_cnt_icm) self.train_cnt_icm += 1 def estimate(self, data: list) -> List[Dict]: @@ -171,19 +130,10 @@ def estimate(self, data: list) -> List[Dict]: obs = collect_states(train_data_augmented) obs = torch.stack(obs).to(self.device) if self.cfg.obs_norm: - # Note: observation normalization: transform obs to mean 0, std 1 - obs = (obs - to_tensor(self._running_mean_std_rnd_obs.mean - ).to(self.device)) / to_tensor(self._running_mean_std_rnd_obs.std).to(self.device) - obs = torch.clamp(obs, min=self.cfg.obs_norm_clamp_min, max=self.cfg.obs_norm_clamp_max) + obs = obs_norm(obs, self._running_mean_std_rnd_obs, self.cfg, self.device) + rnd_reward = self.reward_model.forward(obs) with torch.no_grad(): - predict_feature, target_feature = self.reward_model(obs) - mse = F.mse_loss(predict_feature, target_feature, reduction='none').mean(dim=1) - self._running_mean_std_rnd_reward.update(mse.cpu().numpy()) - - # Note: according to the min-max normalization, transform rnd reward to [0,1] - rnd_reward = (mse - mse.min()) / (mse.max() - mse.min() + 1e-8) - # save the rnd_reward statistics into tb_logger self.estimate_cnt_rnd += 1 
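# Illustrative sketch (not part of the patch): the intrinsic reward that the refactored
# `RNDNetwork.forward(obs)` call above is assumed to return, reconstructed from the deleted lines in
# this file (prediction-error novelty, min-max normalized to [0, 1]). `rnd_intrinsic_reward` is a
# hypothetical helper name; `predictor` and `target` stand for the two encoders of the RND network.
import torch
import torch.nn as nn
import torch.nn.functional as F


def rnd_intrinsic_reward(predictor: nn.Module, target: nn.Module, obs: torch.Tensor) -> torch.Tensor:
    with torch.no_grad():
        predict_feature = predictor(obs)
        target_feature = target(obs)
        # novelty = per-sample prediction error of the trainable predictor vs. the frozen target
        mse = F.mse_loss(predict_feature, target_feature, reduction='none').mean(dim=1)
        # min-max normalization keeps the intrinsic reward in [0, 1]
        return (mse - mse.min()) / (mse.max() - mse.min() + 1e-8)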
self.tb_logger.add_scalar('rnd_reward/rnd_reward_max', rnd_reward.max(), self.estimate_cnt_rnd) @@ -200,19 +150,7 @@ def estimate(self, data: list) -> List[Dict]: # rewards = torch.stack([data[i]['reward'] for i in range(len(data))]) # rewards = (rewards - torch.min(rewards)) / (torch.max(rewards) - torch.min(rewards)) - for item, rnd_rew in zip(train_data_augmented, rnd_reward): - if self.intrinsic_reward_type == 'add': - if self.cfg.extrinsic_reward_norm: - item['reward'] = item[ - 'reward'] / self.cfg.extrinsic_reward_norm_max + rnd_rew * self.cfg.intrinsic_reward_weight - else: - item['reward'] = item['reward'] + rnd_rew * self.cfg.intrinsic_reward_weight - elif self.intrinsic_reward_type == 'new': - item['intrinsic_reward'] = rnd_rew - if self.cfg.extrinsic_reward_norm: - item['reward'] = item['reward'] / self.cfg.extrinsic_reward_norm_max - elif self.intrinsic_reward_type == 'assign': - item['reward'] = rnd_rew + train_data_augmented = combine_intrinsic_exterinsic_reward(train_data_augmented, rnd_reward, self.cfg) # save the augmented_reward statistics into tb_logger rew = [item['reward'].cpu().numpy() for item in train_data_augmented] @@ -225,8 +163,14 @@ def estimate(self, data: list) -> List[Dict]: def collect_data(self, data: list) -> None: self.train_obs.extend(collect_states(data)) - def clear_data(self) -> None: - self.train_obs.clear() + def clear_data(self, iter: int) -> None: + assert hasattr( + self.cfg, 'clear_buffer_per_iters' + ), "Reward Model does not have clear_buffer_per_iters, \ + if you want to clear buffer, you need to add this attribute in config." + + if iter % self.cfg.clear_buffer_per_iters == 0: + self.train_obs.clear() def state_dict(self) -> Dict: return self.reward_model.state_dict() diff --git a/ding/reward_model/tests/test_gail_irl_model.py b/ding/reward_model/tests/test_gail_irl_model.py index bac9d3d950..20bf21a7de 100644 --- a/ding/reward_model/tests/test_gail_irl_model.py +++ b/ding/reward_model/tests/test_gail_irl_model.py @@ -23,21 +23,25 @@ cfg1 = dict( input_size=obs_space_1d + 1, - hidden_size=64, + hidden_size_list=[64], batch_size=5, learning_rate=1e-3, update_per_collect=2, data_path=expert_data_path_1d, + clear_buffer_per_iters=1, ), cfg2 = dict( input_size=obs_space_3d, - hidden_size=64, + hidden_size_list=[16, 16, 16, 16, 64], + kernel_size=[7, 5, 3, 3], + stride=[3, 2, 2, 1], batch_size=5, learning_rate=1e-3, update_per_collect=2, data_path=expert_data_path_3d, action_size=action_space, + clear_buffer_per_iters=1, ), # create fake expert dataset @@ -75,7 +79,7 @@ def test_dataset_1d(cfg): policy.train() train_data_augmented = policy.estimate(data) assert 'reward' in train_data_augmented[0].keys() - policy.clear_data() + policy.clear_data(iter=1) assert len(policy.train_data) == 0 os.popen('rm -rf {}'.format(expert_data_path_1d)) @@ -99,6 +103,6 @@ def test_dataset_3d(cfg): policy.train() train_data_augmented = policy.estimate(data) assert 'reward' in train_data_augmented[0].keys() - policy.clear_data() + policy.clear_data(iter=1) assert len(policy.train_data) == 0 os.popen('rm -rf {}'.format(expert_data_path_3d)) diff --git a/ding/reward_model/tests/test_reward_model_network.py b/ding/reward_model/tests/test_reward_model_network.py new file mode 100644 index 0000000000..c1622dcadf --- /dev/null +++ b/ding/reward_model/tests/test_reward_model_network.py @@ -0,0 +1,25 @@ +from collections.abc import Iterable + +import torch +import pytest +import torch.optim as optim +import torch.nn.functional as F + +from ding.reward_model.network import 
RepresentationNetwork + + +@pytest.mark.unittest +def test_representation_network(): + # len(obs_shape) == 3 + obs_shape = [4, 84, 84] + batch_size = 32 + hidden_size_list = [16, 16, 16, 16] + reward_model = RepresentationNetwork(obs_shape, hidden_size_list) + data = torch.randn([batch_size] + obs_shape) + data_feature = reward_model(data) + assert data_feature.shape == (batch_size, hidden_size_list[-1]) + + # len(obs_shape) == 4 + with pytest.raises(KeyError): + obs_shape = [4, 84, 84, 5] + reward_model = RepresentationNetwork(obs_shape, hidden_size_list) diff --git a/ding/reward_model/tests/test_reward_model_utils.py b/ding/reward_model/tests/test_reward_model_utils.py new file mode 100644 index 0000000000..a765969e62 --- /dev/null +++ b/ding/reward_model/tests/test_reward_model_utils.py @@ -0,0 +1,71 @@ +import pytest +import torch + +from easydict import EasyDict +from ding.reward_model.reward_model_utils import concat_state_action_pairs, combine_intrinsic_exterinsic_reward + + +@pytest.mark.unittest +def test_concat_state_action_pairs(): + data = [{'obs': torch.rand(3), 'action': torch.randint(0, 4, size=(1, ))} for i in range(10)] + states_actions_tensor = concat_state_action_pairs(data) + states_actions_test = [] + for item in data: + state = item['obs'].flatten() + action = item['action'] + s_a = torch.cat([state, action.float()], dim=-1) + states_actions_test.append(s_a) + states_actions_tensor_test = torch.stack(states_actions_test) + assert states_actions_tensor.equal(states_actions_tensor_test) + + +@pytest.mark.unittest +def test_concat_state_action_pairs_one_hot(): + data = [{'obs': torch.rand(3), 'action': torch.randint(0, 4, size=(1, ))} for i in range(10)] + action_size = 5 + states_actions_tensor = concat_state_action_pairs(data, action_size, True) + states_actions_test = [] + for item in data: + state = item['obs'].flatten() + action = item['action'] + action = torch.Tensor([int(i == action) for i in range(action_size)]) + s_a = torch.cat([state, action], dim=-1) + states_actions_test.append(s_a) + states_actions_tensor_test = torch.stack(states_actions_test) + assert states_actions_tensor.equal(states_actions_tensor_test) + + +@pytest.mark.unittest +def test_combine_intrinsic_exterinsic_reward(): + intrinsic_reward = torch.rand(1) + train_data_augument = [{'obs': torch.rand(5), 'reward': torch.rand(1), 'intrinsic_reward': torch.rand(1)}] + extrinsic_reward = train_data_augument[0]['reward'] + config_list = [ + { + 'intrinsic_reward_type': 'add', + 'intrinsic_reward_weight': 1, + 'extrinsic_reward_norm_max': 1, + 'extrinsic_reward_norm': False, + }, { + 'intrinsic_reward_type': 'new', + 'intrinsic_reward_weight': 1, + 'extrinsic_reward_norm_max': 1, + 'extrinsic_reward_norm': True, + }, { + 'intrinsic_reward_type': 'assign', + 'intrinsic_reward_weight': 1, + 'extrinsic_reward_norm_max': 1, + 'extrinsic_reward_norm': True, + } + ] + for config in config_list: + config = EasyDict(config) + train_data_augument = combine_intrinsic_exterinsic_reward(train_data_augument, intrinsic_reward, config) + for item in train_data_augument: + if config.intrinsic_reward_type == 'add': + real_reward = intrinsic_reward + extrinsic_reward + assert item['reward'].equal(real_reward) + elif config.intrinsic_reward_type == 'new': + assert item['intrinsic_reward'].equal(intrinsic_reward.squeeze()) + else: + assert item['reward'].equal(intrinsic_reward.squeeze()) diff --git a/ding/reward_model/trex_reward_model.py b/ding/reward_model/trex_reward_model.py index 635dc5e75e..2c41431498 100644 --- 
a/ding/reward_model/trex_reward_model.py +++ b/ding/reward_model/trex_reward_model.py @@ -1,6 +1,7 @@ from copy import deepcopy -from typing import Tuple, Optional, List, Dict +from typing import Tuple, List, Dict from easydict import EasyDict +from ditk import logging import pickle import os import numpy as np @@ -10,123 +11,12 @@ import torch.optim as optim from ding.utils import REWARD_MODEL_REGISTRY -from ding.utils import SequenceType -from ding.model.common import FCEncoder from ding.utils import build_logger from ding.utils.data import default_collate from .base_reward_model import BaseRewardModel -from .rnd_reward_model import collect_states - - -class TrexConvEncoder(nn.Module): - r""" - Overview: - The ``Convolution Encoder`` used in models. Used to encoder raw 2-dim observation. - Interfaces: - ``__init__``, ``forward`` - """ - - def __init__( - self, - obs_shape: SequenceType, - hidden_size_list: SequenceType = [16, 16, 16, 16, 64, 1], - activation: Optional[nn.Module] = nn.LeakyReLU() - ) -> None: - r""" - Overview: - Init the Trex Convolution Encoder according to arguments. TrexConvEncoder is different \ - from the ConvEncoder in model.common.encoder, their stride and kernel size parameters \ - are different - Arguments: - - obs_shape (:obj:`SequenceType`): Sequence of ``in_channel``, some ``output size`` - - hidden_size_list (:obj:`SequenceType`): The collection of ``hidden_size`` - - activation (:obj:`nn.Module`): - The type of activation to use in the conv ``layers``, - if ``None`` then default set to ``nn.LeakyReLU()`` - """ - super(TrexConvEncoder, self).__init__() - self.obs_shape = obs_shape - self.act = activation - self.hidden_size_list = hidden_size_list - - layers = [] - kernel_size = [7, 5, 3, 3] - stride = [3, 2, 1, 1] - input_size = obs_shape[0] # in_channel - for i in range(len(kernel_size)): - layers.append(nn.Conv2d(input_size, hidden_size_list[i], kernel_size[i], stride[i])) - layers.append(self.act) - input_size = hidden_size_list[i] - layers.append(nn.Flatten()) - self.main = nn.Sequential(*layers) - - flatten_size = self._get_flatten_size() - self.mid = nn.Sequential( - nn.Linear(flatten_size, hidden_size_list[-2]), self.act, - nn.Linear(hidden_size_list[-2], hidden_size_list[-1]) - ) - - def _get_flatten_size(self) -> int: - r""" - Overview: - Get the encoding size after ``self.main`` to get the number of ``in-features`` to feed to ``nn.Linear``. - Arguments: - - x (:obj:`torch.Tensor`): Encoded Tensor after ``self.main`` - Returns: - - outputs (:obj:`torch.Tensor`): Size int, also number of in-feature - """ - test_data = torch.randn(1, *self.obs_shape) - with torch.no_grad(): - output = self.main(test_data) - return output.shape[1] - - def forward(self, x: torch.Tensor) -> torch.Tensor: - r""" - Overview: - Return embedding tensor of the env observation - Arguments: - - x (:obj:`torch.Tensor`): Env raw observation - Returns: - - outputs (:obj:`torch.Tensor`): Embedding tensor - """ - x = self.main(x) - x = self.mid(x) - return x - - -class TrexModel(nn.Module): - - def __init__(self, obs_shape): - super(TrexModel, self).__init__() - if isinstance(obs_shape, int) or len(obs_shape) == 1: - self.encoder = nn.Sequential(FCEncoder(obs_shape, [512, 64]), nn.Linear(64, 1)) - # Conv Encoder - elif len(obs_shape) == 3: - self.encoder = TrexConvEncoder(obs_shape) - else: - raise KeyError( - "not support obs_shape for pre-defined encoder: {}, please customize your own Trex model". 
- format(obs_shape) - ) - - def cum_return(self, traj: torch.Tensor, mode: str = 'sum') -> Tuple[torch.Tensor, torch.Tensor]: - '''calculate cumulative return of trajectory''' - r = self.encoder(traj) - if mode == 'sum': - sum_rewards = torch.sum(r) - sum_abs_rewards = torch.sum(torch.abs(r)) - return sum_rewards, sum_abs_rewards - elif mode == 'batch': - return r, torch.abs(r) - else: - raise KeyError("not support mode: {}, please choose mode=sum or mode=batch".format(mode)) - - def forward(self, traj_i: torch.Tensor, traj_j: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - '''compute cumulative return for each trajectory and return logits''' - cum_r_i, abs_r_i = self.cum_return(traj_i) - cum_r_j, abs_r_j = self.cum_return(traj_j) - return torch.cat((cum_r_i.unsqueeze(0), cum_r_j.unsqueeze(0)), 0), abs_r_i + abs_r_j +from .reward_model_utils import collect_states +from .network import TREXNetwork @REWARD_MODEL_REGISTRY.register('trex') @@ -179,22 +69,25 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> assert device in ["cpu", "cuda"] or "cuda" in device self.device = device self.tb_logger = tb_logger - self.reward_model = TrexModel(self.cfg.policy.model.obs_shape) + kernel_size = config.kernel_size if 'kernel_size' in config else None + stride = config.stride if 'stride' in config else None + self.reward_model = TREXNetwork(self.cfg.obs_shape, config.hidden_size_list, kernel_size, stride) self.reward_model.to(self.device) self.pre_expert_data = [] self.train_data = [] self.expert_data_loader = None - self.opt = optim.Adam(self.reward_model.parameters(), config.reward_model.learning_rate) + self.opt = optim.Adam(self.reward_model.parameters(), config.learning_rate) self.train_iter = 0 + self.estimate_iter = 0 self.learning_returns = [] self.training_obs = [] self.training_labels = [] - self.num_trajs = self.cfg.reward_model.num_trajs - self.num_snippets = self.cfg.reward_model.num_snippets + self.num_trajs = self.cfg.num_trajs + self.num_snippets = self.cfg.num_snippets # minimum number of short subtrajectories to sample - self.min_snippet_length = config.reward_model.min_snippet_length + self.min_snippet_length = config.min_snippet_length # maximum number of short subtrajectories to sample - self.max_snippet_length = config.reward_model.max_snippet_length + self.max_snippet_length = config.max_snippet_length self.l1_reg = 0 self.data_for_save = {} self._logger, self._tb_logger = build_logger( @@ -216,8 +109,8 @@ def load_expert_data(self) -> None: self.learning_returns = pickle.load(f) self.create_training_data() - self._logger.info("num_training_obs: {}".format(len(self.training_obs))) - self._logger.info("num_labels: {}".format(len(self.training_labels))) + logging.info("num_training_obs: {}".format(len(self.training_obs))) + logging.info("num_labels: {}".format(len(self.training_labels))) def create_training_data(self): num_trajs = self.num_trajs @@ -229,10 +122,10 @@ def create_training_data(self): for i in range(len(self.pre_expert_data)): demo_lengths.append([len(d) for d in self.pre_expert_data[i]]) - self._logger.info("demo_lengths: {}".format(demo_lengths)) + logging.info("demo_lengths: {}".format(demo_lengths)) max_snippet_length = min(np.min(demo_lengths), max_snippet_length) - self._logger.info("min snippet length: {}".format(min_snippet_length)) - self._logger.info("max snippet length: {}".format(max_snippet_length)) + logging.info("min snippet length: {}".format(min_snippet_length)) + logging.info("max snippet length: 
{}".format(max_snippet_length)) # collect training data max_traj_length = 0 @@ -285,57 +178,59 @@ def create_training_data(self): label = int(bi <= bj) self.training_obs.append((traj_i, traj_j)) self.training_labels.append(label) - self._logger.info(("maximum traj length: {}".format(max_traj_length))) + logging.info(("maximum traj length: {}".format(max_traj_length))) return self.training_obs, self.training_labels - def _train(self): + def _train(self, training_obs: Tuple, training_labels: Tuple) -> float: # check if gpu available device = self.device # torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Assume that we are on a CUDA machine, then this should print a CUDA device: - self._logger.info("device: {}".format(device)) + logging.info("device: {}".format(device)) + cum_loss = 0.0 + for i in range(len(training_labels)): + + # traj_i, traj_j has the same length, however, they change as i increases + traj_i, traj_j = training_obs[i] # traj_i is a list of array generated by env.step + traj_i = np.array(traj_i) + traj_j = np.array(traj_j) + traj_i = torch.from_numpy(traj_i).float().to(device) + traj_j = torch.from_numpy(traj_j).float().to(device) + + # training_labels[i] is a boolean integer: 0 or 1 + labels = torch.tensor([training_labels[i]]).to(device) + + # forward + backward + zero out gradient + optimize + loss = self.reward_model.learn(traj_i, traj_j, labels) + self.opt.zero_grad() + loss.backward() + self.opt.step() + + # print stats to see if learning + item_loss = loss.item() + cum_loss += item_loss + return cum_loss + # if not os.path.exists(os.path.join(self.cfg.exp_name, 'ckpt_reward_model')): + # os.makedirs(os.path.join(self.cfg.exp_name, 'ckpt_reward_model')) + # torch.save(self.reward_model.state_dict(), os.path.join(self.cfg.exp_name, + # 'ckpt_reward_model/latest.pth.tar')) + # logging.info("finished training") + + def train(self): + # check if gpu available + device = self.device # torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + # Assume that we are on a CUDA machine, then this should print a CUDA device: + logging.info("device: {}".format(device)) training_inputs, training_outputs = self.training_obs, self.training_labels - loss_criterion = nn.CrossEntropyLoss() cum_loss = 0.0 training_data = list(zip(training_inputs, training_outputs)) - for epoch in range(self.cfg.reward_model.update_per_collect): # todo + for epoch in range(self.cfg.update_per_collect): np.random.shuffle(training_data) training_obs, training_labels = zip(*training_data) - for i in range(len(training_labels)): - - # traj_i, traj_j has the same length, however, they change as i increases - traj_i, traj_j = training_obs[i] # traj_i is a list of array generated by env.step - traj_i = np.array(traj_i) - traj_j = np.array(traj_j) - traj_i = torch.from_numpy(traj_i).float().to(device) - traj_j = torch.from_numpy(traj_j).float().to(device) - - # training_labels[i] is a boolean integer: 0 or 1 - labels = torch.tensor([training_labels[i]]).to(device) - - # forward + backward + zero out gradient + optimize - outputs, abs_rewards = self.reward_model.forward(traj_i, traj_j) - outputs = outputs.unsqueeze(0) - loss = loss_criterion(outputs, labels) + self.l1_reg * abs_rewards - self.opt.zero_grad() - loss.backward() - self.opt.step() - - # print stats to see if learning - item_loss = loss.item() - cum_loss += item_loss - if i % 100 == 99: - self._logger.info("[epoch {}:{}] loss {}".format(epoch, i, cum_loss)) - self._logger.info("abs_returns: {}".format(abs_rewards)) - cum_loss = 
0.0 - self._logger.info("check pointing") - if not os.path.exists(os.path.join(self.cfg.exp_name, 'ckpt_reward_model')): - os.makedirs(os.path.join(self.cfg.exp_name, 'ckpt_reward_model')) - torch.save(self.reward_model.state_dict(), os.path.join(self.cfg.exp_name, 'ckpt_reward_model/latest.pth.tar')) - self._logger.info("finished training") - - def train(self): - self._train() + cum_loss = self._train(training_obs, training_labels) + self.train_iter += 1 + logging.info("[epoch {}] loss {}".format(epoch, cum_loss)) + self.tb_logger.add_scalar("trex_reward/train_loss_iteration", cum_loss, self.train_iter) # print out predicted cumulative returns and actual returns sorted_returns = sorted(self.learning_returns, key=lambda s: s[0]) demonstrations = [ @@ -344,7 +239,7 @@ def train(self): with torch.no_grad(): pred_returns = [self.predict_traj_return(self.reward_model, traj[0]) for traj in demonstrations] for i, p in enumerate(pred_returns): - self._logger.info("{} {} {}".format(i, p, sorted_returns[i][0])) + logging.info("{} {} {}".format(i, p, sorted_returns[i][0])) info = { "demo_length": [len(d[0]) for d in self.pre_expert_data], "min_snippet_length": self.min_snippet_length, @@ -353,18 +248,14 @@ def train(self): "lem_num_labels": len(self.training_labels), "accuracy": self.calc_accuracy(self.reward_model, self.training_obs, self.training_labels), } - self._logger.info( - "accuracy and comparison:\n{}".format('\n'.join(['{}: {}'.format(k, v) for k, v in info.items()])) - ) + logging.info("accuracy and comparison:\n{}".format('\n'.join(['{}: {}'.format(k, v) for k, v in info.items()]))) def predict_traj_return(self, net, traj): device = self.device # torch.set_printoptions(precision=20) # torch.use_deterministic_algorithms(True) with torch.no_grad(): - rewards_from_obs = net.cum_return( - torch.from_numpy(np.array(traj)).float().to(device), mode='batch' - )[0].squeeze().tolist() + rewards_from_obs = net.forward(torch.from_numpy(np.array(traj)).float().to(device)).squeeze().tolist() # rewards_from_obs1 = net.cum_return(torch.from_numpy(np.array([traj[0]])).float().to(device))[0].item() # different precision return sum(rewards_from_obs) # rewards_from_obs is a list of floats @@ -383,7 +274,7 @@ def calc_accuracy(self, reward_network, training_inputs, training_outputs): traj_j = torch.from_numpy(traj_j).float().to(device) #forward to get logits - outputs, abs_return = reward_network.forward(traj_i, traj_j) + outputs, abs_return = reward_network.get_outputs_abs_reward(traj_i, traj_j) _, pred_label = torch.max(outputs, 0) if pred_label.item() == label: num_correct += 1. @@ -412,7 +303,11 @@ def estimate(self, data: list) -> List[Dict]: res = collect_states(train_data_augmented) res = torch.stack(res).to(self.device) with torch.no_grad(): - sum_rewards, sum_abs_rewards = self.reward_model.cum_return(res, mode='batch') + sum_rewards = self.reward_model.forward(res) + self.tb_logger.add_scalar("trex_reward/estimate_reward_mean", sum_rewards.mean().item(), self.train_iter) + self.tb_logger.add_scalar("trex_reward/estimate_reward_std", sum_rewards.std().item(), self.train_iter) + self.tb_logger.add_scalar("trex_reward/estimate_reward_max", sum_rewards.max().item(), self.train_iter) + self.tb_logger.add_scalar("trex_reward/estimate_reward_min", sum_rewards.min().item(), self.train_iter) for item, rew in zip(train_data_augmented, sum_rewards): # TODO optimise this loop as well ? 
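# Illustrative sketch (not part of the patch): the pairwise ranking (Bradley-Terry style) loss that
# `TREXNetwork.learn(traj_i, traj_j, labels)` is assumed to compute, reconstructed from the deleted
# training loop above (CrossEntropyLoss over the two predicted returns plus an l1 term on reward
# magnitudes). `trex_ranking_loss` and `reward_net` are hypothetical names.
import torch
import torch.nn as nn


def trex_ranking_loss(
        reward_net: nn.Module,  # maps a trajectory of observations to per-step rewards
        traj_i: torch.Tensor,
        traj_j: torch.Tensor,
        label: torch.Tensor,  # long tensor of shape (1, ): index (0 or 1) of the better trajectory
        l1_reg: float = 0.,
) -> torch.Tensor:
    # predicted per-step rewards and cumulative returns of the two trajectories
    r_i, r_j = reward_net(traj_i), reward_net(traj_j)
    cum_r_i, cum_r_j = torch.sum(r_i), torch.sum(r_j)
    abs_rewards = torch.sum(torch.abs(r_i)) + torch.sum(torch.abs(r_j))
    # logits over the two trajectories; the better one should receive the larger cumulative return
    logits = torch.stack([cum_r_i, cum_r_j]).unsqueeze(0)
    return nn.CrossEntropyLoss()(logits, label) + l1_reg * abs_rewards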
item['reward'] = rew @@ -430,11 +325,12 @@ def collect_data(self, data: list) -> None: """ pass - def clear_data(self) -> None: + def clear_data(self, iter: int) -> None: """ Overview: Clearing training data. \ This is a side effect function which clears the data attribute in ``self`` """ - self.training_obs.clear() - self.training_labels.clear() + if hasattr(self.cfg, 'clear_buffer_per_iters') and iter % self.cfg.clear_buffer_per_iters == 0: + self.training_obs.clear() + self.training_labels.clear() diff --git a/dizoo/atari/config/serial/montezuma/montezuma_ngu_config.py b/dizoo/atari/config/serial/montezuma/montezuma_ngu_config.py deleted file mode 100644 index f82d0424d0..0000000000 --- a/dizoo/atari/config/serial/montezuma/montezuma_ngu_config.py +++ /dev/null @@ -1,127 +0,0 @@ -from easydict import EasyDict - -collector_env_num = 8 -evaluator_env_num = 8 -nstep = 5 -montezuma_ppo_rnd_config = dict( - exp_name='montezuma_ngu_seed0', - env=dict( - collector_env_num=collector_env_num, - evaluator_env_num=evaluator_env_num, - n_evaluator_episode=8, - env_id='MontezumaRevengeNoFrameskip-v4', - #'ALE/MontezumaRevenge-v5' is available. But special setting is needed after gym make. - obs_plus_prev_action_reward=True, # use specific env wrapper for ngu policy - stop_value=int(1e5), - frame_stack=4, - ), - rnd_reward_model=dict( - intrinsic_reward_type='add', - learning_rate=0.001, - obs_shape=[4, 84, 84], - action_shape=18, - batch_size=320, - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='rnd-ngu', - ), - episodic_reward_model=dict( - # means if using rescale trick to the last non-zero reward - # when combing extrinsic and intrinsic reward. - # the rescale trick only used in: - # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal - # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander - # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, - # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the - # original last nonzero extrinsic reward. - # please refer to ngu_reward_model for details. - last_nonzero_reward_rescale=False, - # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True - # please refer to ngu_reward_model for details. 
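# Illustrative sketch (not part of the patch): the `clear_buffer_per_iters` gating that this patch
# adds to every `clear_data(iter)` implementation, shown on a minimal stand-in buffer so the
# entry-side call pattern ("code work in entry") is explicit. `ToyRewardBuffer` and the loop below
# are hypothetical.
from easydict import EasyDict


class ToyRewardBuffer:

    def __init__(self, cfg: EasyDict) -> None:
        assert hasattr(cfg, 'clear_buffer_per_iters'), "config must define clear_buffer_per_iters"
        self.cfg = cfg
        self.train_data = []

    def collect_data(self, data: list) -> None:
        self.train_data.extend(data)

    def clear_data(self, iter: int) -> None:
        # only clear on the configured schedule, so the reward model's data buffer never gets too small
        if iter % self.cfg.clear_buffer_per_iters == 0:
            self.train_data.clear()


buffer = ToyRewardBuffer(EasyDict(clear_buffer_per_iters=10))
for train_iter in range(1, 31):
    buffer.collect_data([{'obs': train_iter}])
    buffer.clear_data(iter=train_iter)  # the buffer is emptied only at iterations 10, 20 and 30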
- last_nonzero_reward_weight=1, - intrinsic_reward_type='add', - learning_rate=0.001, - obs_shape=[4, 84, 84], - action_shape=18, - batch_size=320, - update_per_collect=10, # 32*100/64=50 - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='episodic', - ), - policy=dict( - cuda=True, - on_policy=False, - priority=True, - priority_IS_weight=True, - discount_factor=0.997, - nstep=nstep, - burnin_step=20, - # (int) is the total length of [sequence sample] minus - # the length of burnin part in [sequence sample], - # i.e., = = + - learn_unroll_len=80, # set this key according to the episode length - model=dict( - obs_shape=[4, 84, 84], - action_shape=18, - encoder_hidden_size_list=[128, 128, 512], - collector_env_num=collector_env_num, - ), - learn=dict( - update_per_collect=8, - batch_size=64, - learning_rate=0.0005, - target_update_theta=0.001, - ), - collect=dict( - # NOTE: It is important that set key traj_len_inf=True here, - # to make sure self._traj_len=INF in serial_sample_collector.py. - # In sequence-based policy, for each collect_env, - # we want to collect data of length self._traj_len=INF - # unless the episode enters the 'done' state. - # In each collect phase, we collect a total of sequence samples. - n_sample=32, - traj_len_inf=True, - env_num=collector_env_num, - ), - eval=dict(env_num=evaluator_env_num, ), - other=dict( - eps=dict( - type='exp', - start=0.95, - end=0.05, - decay=1e5, - ), - replay_buffer=dict( - replay_buffer_size=int(2e3), - # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization - alpha=0.6, - # (Float type) How much correction is used: 0 means no correction while 1 means full correction - beta=0.4, - ) - ), - ), -) -montezuma_ppo_rnd_config = EasyDict(montezuma_ppo_rnd_config) -main_config = montezuma_ppo_rnd_config -montezuma_ppo_rnd_create_config = dict( - env=dict( - type='atari', - import_names=['dizoo.atari.envs.atari_env'], - ), - env_manager=dict(type='subprocess'), - policy=dict(type='ngu'), - rnd_reward_model=dict(type='rnd-ngu'), - episodic_reward_model=dict(type='episodic'), -) -montezuma_ppo_rnd_create_config = EasyDict(montezuma_ppo_rnd_create_config) -create_config = montezuma_ppo_rnd_create_config - -if __name__ == "__main__": - from ding.entry import serial_pipeline_reward_model_ngu - serial_pipeline_reward_model_ngu([main_config, create_config], seed=0) diff --git a/dizoo/atari/config/serial/pitfall/pitfall_ngu_config.py b/dizoo/atari/config/serial/pitfall/pitfall_ngu_config.py deleted file mode 100644 index 6e2d76c3a6..0000000000 --- a/dizoo/atari/config/serial/pitfall/pitfall_ngu_config.py +++ /dev/null @@ -1,134 +0,0 @@ -from easydict import EasyDict - -collector_env_num = 32 -evaluator_env_num = 5 -nstep = 5 - -pitfall_ppo_rnd_config = dict( - # Note: - # 1. at least 1e10 timesteps, i.e., 10000 million, the reward may increase, please be patient. - # 2. the larger unroll_lenth and replay buffer size may have better results, but also require more memory. - # exp_name='debug_pitfall_ngu_ul298_er01_n32_rlbs2e4', - # exp_name='debug_pitfall_ngu_ul98_er01_n32_rlbs2e4', - # exp_name='debug_pitfall_ngu_ul40_er01_n32_rlbs2e4', - exp_name='pitfall_ngu_seed0', - env=dict( - collector_env_num=collector_env_num, - evaluator_env_num=evaluator_env_num, - n_evaluator_episode=5, - env_id='PitfallNoFrameskip-v4', - #'ALE/Pitfall-v5' is available. But special setting is needed after gym make. 
- obs_plus_prev_action_reward=True, # use specific env wrapper for ngu policy - stop_value=int(1e5), - frame_stack=4, - ), - rnd_reward_model=dict( - intrinsic_reward_type='add', # 'assign' - learning_rate=1e-4, - obs_shape=[4, 84, 84], - action_shape=18, - batch_size=320, - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='rnd-ngu', - ), - episodic_reward_model=dict( - # means if using rescale trick to the last non-zero reward - # when combing extrinsic and intrinsic reward. - # the rescale trick only used in: - # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal - # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander - # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, - # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the - # original last nonzero extrinsic reward. - # please refer to ngu_reward_model for details. - last_nonzero_reward_rescale=False, - # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True - # please refer to ngu_reward_model for details. - last_nonzero_reward_weight=1, - intrinsic_reward_type='add', - learning_rate=1e-4, - obs_shape=[4, 84, 84], - action_shape=18, - batch_size=320, - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='episodic', - ), - policy=dict( - cuda=True, - on_policy=False, - priority=True, - priority_IS_weight=True, - discount_factor=0.997, - nstep=nstep, - burnin_step=20, - # (int) is the total length of [sequence sample] minus - # the length of burnin part in [sequence sample], - # i.e., = = + - learn_unroll_len=80, # set this key according to the episode length - model=dict( - obs_shape=[4, 84, 84], - action_shape=18, - encoder_hidden_size_list=[128, 128, 512], - collector_env_num=collector_env_num, - ), - learn=dict( - update_per_collect=8, - batch_size=64, - learning_rate=0.0005, - target_update_theta=0.001, - ), - collect=dict( - # NOTE: It is important that set key traj_len_inf=True here, - # to make sure self._traj_len=INF in serial_sample_collector.py. - # In sequence-based policy, for each collect_env, - # we want to collect data of length self._traj_len=INF - # unless the episode enters the 'done' state. - # In each collect phase, we collect a total of sequence samples. 
- n_sample=32, - traj_len_inf=True, - env_num=collector_env_num, - ), - eval=dict(env_num=evaluator_env_num, ), - other=dict( - eps=dict( - type='exp', - start=0.95, - end=0.05, - decay=1e5, - ), - replay_buffer=dict( - replay_buffer_size=int(3e3), - # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization - alpha=0.6, - # (Float type) How much correction is used: 0 means no correction while 1 means full correction - beta=0.4, - ) - ), - ), -) -pitfall_ppo_rnd_config = EasyDict(pitfall_ppo_rnd_config) -main_config = pitfall_ppo_rnd_config -pitfall_ppo_rnd_create_config = dict( - env=dict( - type='atari', - import_names=['dizoo.atari.envs.atari_env'], - ), - env_manager=dict(type='subprocess'), - policy=dict(type='ngu'), - rnd_reward_model=dict(type='rnd-ngu'), - episodic_reward_model=dict(type='episodic'), -) -pitfall_ppo_rnd_create_config = EasyDict(pitfall_ppo_rnd_create_config) -create_config = pitfall_ppo_rnd_create_config - -if __name__ == "__main__": - from ding.entry import serial_pipeline_reward_model_ngu - serial_pipeline_reward_model_ngu([main_config, create_config], seed=0) diff --git a/dizoo/atari/config/serial/pong/pong_gail_dqn_config.py b/dizoo/atari/config/serial/pong/pong_gail_dqn_config.py index 505b75b626..5c20b0a0c4 100644 --- a/dizoo/atari/config/serial/pong/pong_gail_dqn_config.py +++ b/dizoo/atari/config/serial/pong/pong_gail_dqn_config.py @@ -14,7 +14,9 @@ reward_model=dict( type='gail', input_size=[4, 84, 84], - hidden_size=128, + hidden_size_list=[16, 16, 16, 16, 64], + kernel_size=[7, 5, 3, 3], + stride=[3, 2, 2, 1], batch_size=64, learning_rate=1e-3, update_per_collect=100, @@ -29,7 +31,7 @@ # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. # Absolute path is recommended. # In DI-engine, it is usually located in ``exp_name`` directory - # e.g. 'exp_name/expert_data.pkl' + # e.g. 'exp_name' data_path='data_path_placeholder', ), policy=dict( @@ -78,13 +80,17 @@ # or you can enter `ding -m serial_gail -c pong_gail_dqn_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g. 
pong_dqn_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.atari.config.serial.pong import pong_dqn_config, pong_dqn_create_config - expert_main_config = pong_dqn_config - expert_create_config = pong_dqn_create_config - serial_pipeline_gail( - (main_config, create_config), (expert_main_config, expert_create_config), - max_env_step=1000000, - seed=0, - collect_data=True - ) \ No newline at end of file + + # set your expert config here + expert_cfg = (pong_dqn_config, pong_dqn_create_config) + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, expert_data_path=expert_data_path, collect_count=main_config.reward_model.collect_count + ) + + # train reward model + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/atari/config/serial/pong/pong_ngu_config.py b/dizoo/atari/config/serial/pong/pong_ngu_config.py index 065dce2186..463e390d8f 100644 --- a/dizoo/atari/config/serial/pong/pong_ngu_config.py +++ b/dizoo/atari/config/serial/pong/pong_ngu_config.py @@ -15,44 +15,49 @@ stop_value=20, frame_stack=4, ), - rnd_reward_model=dict( - intrinsic_reward_type='add', - learning_rate=1e-4, - obs_shape=[4, 84, 84], - action_shape=6, - batch_size=320, - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='rnd-ngu', - ), - episodic_reward_model=dict( - # means if using rescale trick to the last non-zero reward - # when combing extrinsic and intrinsic reward. - # the rescale trick only used in: - # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal - # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander - # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, - # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the - # original last nonzero extrinsic reward. - # please refer to ngu_reward_model for details. - last_nonzero_reward_rescale=False, - # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True - # please refer to ngu_reward_model for details. - last_nonzero_reward_weight=1, - intrinsic_reward_type='add', - learning_rate=1e-4, - obs_shape=[4, 84, 84], - action_shape=6, - batch_size=320, - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='episodic', + reward_model=dict( + type='ngu-reward', + policy_nstep=nstep, + collect_env_num=collector_env_num, + rnd_reward_model=dict( + intrinsic_reward_type='add', + learning_rate=1e-4, + obs_shape=[4, 84, 84], + action_shape=6, + batch_size=320, + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='rnd-ngu', + ), + episodic_reward_model=dict( + # means if using rescale trick to the last non-zero reward + # when combing extrinsic and intrinsic reward. + # the rescale trick only used in: + # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal + # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. 
lunarlander + # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, + # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the + # original last nonzero extrinsic reward. + # please refer to ngu_reward_model for details. + last_nonzero_reward_rescale=False, + # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True + # please refer to ngu_reward_model for details. + last_nonzero_reward_weight=1, + intrinsic_reward_type='add', + learning_rate=1e-4, + obs_shape=[4, 84, 84], + action_shape=6, + batch_size=320, + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='episodic', + ), ), policy=dict( cuda=True, @@ -116,13 +121,12 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='ngu'), - rnd_reward_model=dict(type='rnd-ngu'), - episodic_reward_model=dict(type='episodic'), + reward_model=dict(type='ngu-reward'), ) pong_ppo_rnd_create_config = EasyDict(pong_ppo_rnd_create_config) create_config = pong_ppo_rnd_create_config if __name__ == "__main__": # or you can enter `ding -m serial_ngu -c pong_ngu_config.py -s 0` - from ding.entry import serial_pipeline_ngu - serial_pipeline_ngu([main_config, create_config], seed=0) + from ding.entry import serial_pipeline_reward_model_offpolicy + serial_pipeline_reward_model_offpolicy([main_config, create_config], seed=0) diff --git a/dizoo/atari/config/serial/pong/pong_trex_offppo_config.py b/dizoo/atari/config/serial/pong/pong_trex_offppo_config.py index 3351931380..af58398a68 100644 --- a/dizoo/atari/config/serial/pong/pong_trex_offppo_config.py +++ b/dizoo/atari/config/serial/pong/pong_trex_offppo_config.py @@ -13,6 +13,7 @@ ), reward_model=dict( type='trex', + exp_name='pong_trex_offppo_seed0', min_snippet_length=50, max_snippet_length=100, checkpoint_min=0, @@ -24,14 +25,10 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. - expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /pong.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. 
- # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + expert_model_path='pong_ppo_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=[4, 84, 84], + action_shape=6, ), policy=dict( cuda=True, @@ -80,6 +77,7 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='ppo_offpolicy'), + reward_model=dict(type='trex'), ) pong_trex_ppo_create_config = EasyDict(pong_trex_ppo_create_config) create_config = pong_trex_ppo_create_config @@ -91,7 +89,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_preference_based_irl + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -99,4 +97,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_preference_based_irl((main_config, create_config)) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/atari/config/serial/pong/pong_trex_sql_config.py b/dizoo/atari/config/serial/pong/pong_trex_sql_config.py index e1d4991294..991aae76f2 100644 --- a/dizoo/atari/config/serial/pong/pong_trex_sql_config.py +++ b/dizoo/atari/config/serial/pong/pong_trex_sql_config.py @@ -13,6 +13,7 @@ ), reward_model=dict( type='trex', + exp_name='pong_trex_sql_seed0', min_snippet_length=50, max_snippet_length=100, checkpoint_min=10000, @@ -24,14 +25,10 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. - expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /pong.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. - # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + expert_model_path='pong_sql_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=[4, 84, 84], + action_shape=6, ), policy=dict( cuda=False, @@ -65,6 +62,7 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='sql'), + reward_model=dict(type='trex'), ) pong_trex_sql_create_config = EasyDict(pong_trex_sql_create_config) create_config = pong_trex_sql_create_config @@ -76,7 +74,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_preference_based_irl + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -84,4 +82,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_preference_based_irl((main_config, create_config)) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py b/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py index 95d1d9716d..9925075f8a 100644 --- a/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py +++ b/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py @@ -13,6 +13,7 @@ ), reward_model=dict( type='trex', + exp_name='qbert_trex_dqn_seed0', min_snippet_length=30, max_snippet_length=100, checkpoint_min=0, @@ -20,9 +21,14 @@ checkpoint_step=100, learning_rate=1e-5, update_per_collect=1, - expert_model_path='abs model path', - reward_model_path='abs data path + ./qbert.params', - offline_data_path='abs data path', + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. + # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. + expert_model_path='qbert_dqn_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=[4, 84, 84], + action_shape=6, ), policy=dict( cuda=True, @@ -62,6 +68,7 @@ ), env_manager=dict(type='base'), policy=dict(type='dqn'), + reward_model=dict(type='trex'), ) qbert_trex_dqn_create_config = EasyDict(qbert_trex_dqn_create_config) create_config = qbert_trex_dqn_create_config @@ -74,7 +81,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_reward_model_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') @@ -83,4 +90,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_reward_model_trex((main_config, create_config)) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py b/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py index 3621edc462..bb80daa3c4 100644 --- a/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py +++ b/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py @@ -13,6 +13,7 @@ ), reward_model=dict( type='trex', + exp_name='qbert_trex_offppo_seed0', min_snippet_length=30, max_snippet_length=100, checkpoint_min=0, @@ -20,9 +21,14 @@ checkpoint_step=100, learning_rate=1e-5, update_per_collect=1, - expert_model_path='abs model path', - reward_model_path='abs data path + ./qbert.params', - offline_data_path='abs data path', + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. + # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. 
+ expert_model_path='qbert_ppo_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=[4, 84, 84], + action_shape=6, ), policy=dict( cuda=True, @@ -69,6 +75,7 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='ppo_offpolicy'), + reward_model=dict(type='trex'), ) create_config = EasyDict(qbert_trex_ppo_create_config) @@ -80,7 +87,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_reward_model_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') @@ -89,4 +96,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_reward_model_trex((main_config, create_config)) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_dqn_config.py b/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_dqn_config.py index 1a4491d68a..22c6a4e65a 100644 --- a/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_dqn_config.py +++ b/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_dqn_config.py @@ -15,6 +15,7 @@ ), reward_model=dict( type='trex', + exp_name='spaceinvaders_trex_dqn_seed0', min_snippet_length=50, max_snippet_length=100, checkpoint_min=10000, @@ -28,17 +29,10 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name``. # For example, if you want to use dqn to generate demos, you can use ``spaceinvaders_dqn`` - expert_model_path='model_path_placeholder', - # path to save reward model - # Users should add their own model path here. - # Absolute path is recommended. - # For example, if you use ``spaceinvaders_drex``, then the reward model will be saved in this directory. - reward_model_path='model_path_placeholder + ./spaceinvaders.params', - # path to save generated observations. - # Users should add their own model path here. - # Absolute path is recommended. - # For example, if you use ``spaceinvaders_drex``, then all the generated data will be saved in this directory. - offline_data_path='data_path_placeholder', + expert_model_path='spaceinvaders_dqn_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=[4, 84, 84], + action_shape=6, ), policy=dict( cuda=True, @@ -78,6 +72,7 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='dqn'), + reward_model=dict(type='trex'), ) spaceinvaders_trex_dqn_create_config = EasyDict(spaceinvaders_trex_dqn_create_config) create_config = spaceinvaders_trex_dqn_create_config @@ -89,7 +84,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -97,4 +92,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_trex([main_config, create_config]) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_offppo_config.py b/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_offppo_config.py index 7934a67a7d..c75fa79171 100644 --- a/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_offppo_config.py +++ b/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_offppo_config.py @@ -15,6 +15,7 @@ ), reward_model=dict( type='trex', + exp_name='spaceinvaders_trex_offppo_seed0', min_snippet_length=30, max_snippet_length=100, checkpoint_min=0, @@ -27,17 +28,10 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name``. # For example, if you want to use dqn to generate demos, you can use ``spaceinvaders_dqn`` - expert_model_path='model_path_placeholder', - # path to save reward model - # Users should add their own model path here. - # Absolute path is recommended. - # For example, if you use ``spaceinvaders_drex``, then the reward model will be saved in this directory. - reward_model_path='model_path_placeholder + ./spaceinvaders.params', - # path to save generated observations. - # Users should add their own model path here. - # Absolute path is recommended. - # For example, if you use ``spaceinvaders_drex``, then all the generated data will be saved in this directory. - offline_data_path='data_path_placeholder', + expert_model_path='spaceinvaders_ppo_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=[4, 84, 84], + action_shape=6, ), policy=dict( cuda=True, @@ -85,6 +79,7 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='ppo_offpolicy'), + reward_model=dict(type='trex'), ) spaceinvaders_trex_ppo_create_config = EasyDict(spaceinvaders_trex_ppo_create_config) create_config = spaceinvaders_trex_ppo_create_config @@ -96,7 +91,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -104,4 +99,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_trex([main_config, create_config]) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/box2d/bipedalwalker/config/bipedalwalker_gail_sac_config.py b/dizoo/box2d/bipedalwalker/config/bipedalwalker_gail_sac_config.py index 4ef3d1b068..8fca3d69db 100755 --- a/dizoo/box2d/bipedalwalker/config/bipedalwalker_gail_sac_config.py +++ b/dizoo/box2d/bipedalwalker/config/bipedalwalker_gail_sac_config.py @@ -18,7 +18,7 @@ reward_model=dict( type='gail', input_size=obs_shape + act_shape, - hidden_size=64, + hidden_size_list=[64], batch_size=64, learning_rate=1e-3, update_per_collect=100, @@ -87,10 +87,17 @@ # or you can enter `ding -m serial_gail -c bipedalwalker_sac_gail_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g. 
bipedalwalker_sac_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.box2d.bipedalwalker.config import bipedalwalker_sac_config, bipedalwalker_sac_create_config - expert_main_config = bipedalwalker_sac_config - expert_create_config = bipedalwalker_sac_create_config - serial_pipeline_gail( - [main_config, create_config], [expert_main_config, expert_create_config], seed=0, collect_data=True + + # set your expert config here + expert_cfg = (bipedalwalker_sac_config, bipedalwalker_sac_create_config) + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, expert_data_path=expert_data_path, collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_cont_sac_config.py b/dizoo/box2d/lunarlander/config/lunarlander_cont_sac_config.py index 0e60fce608..f8a8ab47e7 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_cont_sac_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_cont_sac_config.py @@ -28,9 +28,7 @@ learning_rate_alpha=3e-4, auto_alpha=True, ), - collect=dict( - n_sample=256, - ), + collect=dict(n_sample=256, ), eval=dict(evaluator=dict(eval_freq=1000, ), ), other=dict(replay_buffer=dict(replay_buffer_size=int(1e5), ), ), ), diff --git a/dizoo/box2d/lunarlander/config/lunarlander_drex_dqn_config.py b/dizoo/box2d/lunarlander/config/lunarlander_drex_dqn_config.py new file mode 100644 index 0000000000..d38343fbc7 --- /dev/null +++ b/dizoo/box2d/lunarlander/config/lunarlander_drex_dqn_config.py @@ -0,0 +1,124 @@ +from easydict import EasyDict + +nstep = 1 +lunarlander_drex_dqn_config = dict( + exp_name='lunarlander_drex_dqn_seed0', + env=dict( + # Whether to use shared memory. Only effective if "env_manager_type" is 'subprocess' + # Env number respectively for collector and evaluator. + collector_env_num=8, + evaluator_env_num=8, + env_id='LunarLander-v2', + n_evaluator_episode=8, + stop_value=200, + ), + reward_model=dict( + type='drex', + exp_name='lunarlander_drex_dqn_seed0', + min_snippet_length=30, + max_snippet_length=100, + checkpoint_min=1000, + checkpoint_max=9000, + checkpoint_step=1000, + num_snippets=60000, + num_trajs_per_bin=20, + num_trajs=6, + bc_iterations=6000, + learning_rate=1e-5, + update_per_collect=1, + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. + expert_model_path='lunarlander_dqn_seed0/ckpt/ckpt_best.pth.tar', + reward_model_path='lunarlander_dqn_seed0/cartpole.params', + offline_data_path='lunarlander_drex_dqn_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=8, + action_shape=4, + eps_list=[0, 0.5, 1], + ), + policy=dict( + # Whether to use cuda for network. + cuda=False, + model=dict( + obs_shape=8, + action_shape=4, + encoder_hidden_size_list=[512, 64], + # Whether to use dueling head. + dueling=True, + ), + # Reward's future discount factor, aka. gamma. + discount_factor=0.99, + # How many steps in td error. + nstep=nstep, + # learn_mode config + learn=dict( + update_per_collect=10, + batch_size=64, + learning_rate=0.001, + # Frequency of target network update. 
+ target_update_freq=100, + ), + # collect_mode config + collect=dict( + # You can use either "n_sample" or "n_episode" in collector.collect. + # Get "n_sample" samples per collect. + n_sample=64, + # Cut trajectories into pieces with length "unroll_len". + unroll_len=1, + collector=dict( + get_train_sample=False, + reward_shaping=False, + ), + ), + # command_mode config + other=dict( + # Epsilon greedy with decay. + eps=dict( + # Decay type. Support ['exp', 'linear']. + type='exp', + start=0.95, + end=0.1, + decay=50000, + ), + replay_buffer=dict(replay_buffer_size=100000, ) + ), + ), +) +lunarlander_drex_dqn_config = EasyDict(lunarlander_drex_dqn_config) +main_config = lunarlander_drex_dqn_config + +lunarlander_drex_dqn_create_config = dict( + env=dict( + type='lunarlander', + import_names=['dizoo.box2d.lunarlander.envs.lunarlander_env'], + ), + env_manager=dict(type='base'), + policy=dict(type='dqn'), + reward_model=dict(type='drex'), +) +lunarlander_drex_dqn_create_config = EasyDict(lunarlander_drex_dqn_create_config) +create_config = lunarlander_drex_dqn_create_config + +if __name__ == '__main__': + # Users should first run ``lunarlander_dqn_config.py`` to save models (or checkpoints). + # Note: Users should check that the checkpoints generated should include iteration_'checkpoint_min'.pth.tar, iteration_'checkpoint_max'.pth.tar with the interval checkpoint_step + # where checkpoint_max, checkpoint_min, checkpoint_step are specified above. + import argparse + import torch + from ding.config import read_config + from ding.entry import drex_collecting_data + from ding.entry import serial_pipeline_reward_model_offpolicy + parser = argparse.ArgumentParser() + parser.add_argument('--cfg', type=str, default='please enter abs path for this file') + parser.add_argument('--seed', type=int, default=0) + parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') + args = parser.parse_args() + args.cfg = read_config(args.cfg) + args.cfg[1].policy.type = 'bc' + args.cfg[0].policy.collect.n_episode = 64 + del args.cfg[0].policy.collect.n_sample + drex_collecting_data(args) + serial_pipeline_reward_model_offpolicy( + (main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False, max_env_step=int(1e7) + ) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_gail_dqn_config.py b/dizoo/box2d/lunarlander/config/lunarlander_gail_dqn_config.py index 855f845980..c992a1263a 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_gail_dqn_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_gail_dqn_config.py @@ -15,7 +15,7 @@ reward_model=dict( type='gail', input_size=9, - hidden_size=64, + hidden_size_list=[64], batch_size=64, learning_rate=1e-3, update_per_collect=100, @@ -29,7 +29,7 @@ # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. # Absolute path is recommended. # In DI-engine, it is usually located in ``exp_name`` directory - # e.g. 'exp_name/expert_data.pkl' + # e.g. 'exp_name' data_path='data_path_placeholder', ), policy=dict( @@ -96,13 +96,17 @@ # or you can enter `ding -m serial_gail -c lunarlander_dqn_gail_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g. 
lunarlander_dqn_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.box2d.lunarlander.config import lunarlander_dqn_config, lunarlander_dqn_create_config - expert_main_config = lunarlander_dqn_config - expert_create_config = lunarlander_dqn_create_config - serial_pipeline_gail( - [main_config, create_config], [expert_main_config, expert_create_config], - max_env_step=1000000, - seed=0, - collect_data=True + + # set your expert config here + expert_cfg = (lunarlander_dqn_config, lunarlander_dqn_create_config) + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, expert_data_path=expert_data_path, collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py b/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py index 60065ae33b..2def4718fe 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py @@ -15,6 +15,16 @@ batch_size=32, continuous=False, update_per_collect=20, + # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. + # Absolute path is recommended. + # In DI-engine, it is usually located in ``exp_name`` directory + # e.g. 'exp_name/expert_data.pkl' + expert_data_path='lunarlander_ppo_offpolicy_seed0/expert_data.pkl', + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. + expert_model_path='lunarlander_ppo_offpolicy_seed0/ckpt/ckpt_best.pth.tar', + collect_count=100000, ), policy=dict( cuda=False, @@ -35,13 +45,6 @@ adv_norm=True, ), collect=dict( - # Users should add their own model path here. Model path should lead to a model. - # Absolute path is recommended. - # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. - model_path='model_path_placeholder', - # If you need the data collected by the collector to contain logit key which reflect the probability of - # the action, you can change the key to be True. - # In Guided cost Learning, we need to use logit to train the reward model, we change the key to be True. 
collector_logit=True, n_sample=800, unroll_len=1, @@ -65,5 +68,19 @@ create_config = lunarlander_ppo_create_config if __name__ == "__main__": - from ding.entry import serial_pipeline_guided_cost - serial_pipeline_guided_cost([main_config, create_config], seed=0) + # or you can enter `ding -m serial -c lunarlander_ppo_offpolicy_config.py -s 0` + from ding.entry import collect_demo_data, serial_pipeline_reward_model_offpolicy + from dizoo.box2d.lunarlander.config.lunarlander_offppo_config import lunarlander_ppo_offpolicy_config, lunarlander_ppo_offpolicy_create_config + + expert_cfg = (lunarlander_ppo_offpolicy_config, lunarlander_ppo_offpolicy_create_config) + expert_data_path = main_config.reward_model.expert_data_path + state_dict_path = main_config.reward_model.expert_model_path + collect_count = main_config.reward_model.collect_count + collect_demo_data( + expert_cfg, + seed=0, + state_dict_path=state_dict_path, + expert_data_path=expert_data_path, + collect_count=collect_count + ) + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py b/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py index d4cb7cfe26..f19b2bfa71 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py @@ -13,44 +13,49 @@ n_evaluator_episode=evaluator_env_num, stop_value=195, ), - rnd_reward_model=dict( - intrinsic_reward_type='add', - learning_rate=5e-4, - obs_shape=8, - action_shape=4, - batch_size=320, # transitions - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=5, - hidden_size_list=[128, 128, 64], - type='rnd-ngu', - ), - episodic_reward_model=dict( - # means if using rescale trick to the last non-zero reward - # when combing extrinsic and intrinsic reward. - # the rescale trick only used in: - # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal - # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander - # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, - # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the - # original last nonzero extrinsic reward. - # please refer to ngu_reward_model for details. - last_nonzero_reward_rescale=True, - # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True - # please refer to ngu_reward_model for details. - last_nonzero_reward_weight=100, - intrinsic_reward_type='add', - learning_rate=5e-4, - obs_shape=8, - action_shape=4, - batch_size=320, # transitions - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='episodic', + reward_model=dict( + type='ngu-reward', + policy_nstep=nstep, + collect_env_num=collector_env_num, + rnd_reward_model=dict( + intrinsic_reward_type='add', + learning_rate=5e-4, + obs_shape=8, + action_shape=4, + batch_size=320, # transitions + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=5, + hidden_size_list=[128, 128, 64], + type='rnd-ngu', + ), + episodic_reward_model=dict( + # This refers to the use of a rescaling method applied to the final non-zero reward. + # when combing the extrinsic and intrinsic reward. 
+ # the rescale trick only used in: + # 1. Environments with sparse rewards, such as the MiniGrid, where the final non-zero reward provides a significant positive signal. + # 2. Situations where the last reward of each episode directly corresponds to the agent's task completion, such as in the LunarLander environment. + # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, + # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the + # original last nonzero extrinsic reward. + # please refer to ngu_reward_model for details. + last_nonzero_reward_rescale=True, + # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True + # please refer to ngu_reward_model for details. + last_nonzero_reward_weight=100, + intrinsic_reward_type='add', + learning_rate=5e-4, + obs_shape=8, + action_shape=4, + batch_size=320, # transitions + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='episodic', + ), ), policy=dict( cuda=True, @@ -118,13 +123,12 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='ngu'), - rnd_reward_model=dict(type='rnd-ngu'), - episodic_reward_model=dict(type='episodic'), + reward_model=dict(type='ngu-reward'), ) lunarlander_ngu_create_config = EasyDict(lunarlander_ngu_create_config) create_config = lunarlander_ngu_create_config if __name__ == "__main__": # or you can enter `ding -m serial_ngu -c lunarlander_ngu_config.py -s 0` - from ding.entry import serial_pipeline_ngu - serial_pipeline_ngu([main_config, create_config], seed=0) + from ding.entry import serial_pipeline_reward_model_offpolicy + serial_pipeline_reward_model_offpolicy([main_config, create_config], seed=0) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py b/dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py index 790ca5c271..f3e3fc61cd 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py @@ -14,6 +14,7 @@ ), reward_model=dict( type='trex', + exp_name='lunarlander_trex_dqn_seed0', min_snippet_length=30, max_snippet_length=100, checkpoint_min=1000, @@ -25,14 +26,10 @@ # Users should add their own model path here. Model path should lead to a model. # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. - expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /lunarlander.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. - # In DI-engine, it is usually located in ``exp_name`` directory - # e.g. 'exp_name/expert_data.pkl' - data_path='data_path_placeholder', + expert_model_path='lunarlander_dqn_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=8, + action_shape=4, ), policy=dict( # Whether to use cuda for network. 
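The TREX configs touched in this patch all switch from the removed ``serial_pipeline_preference_based_irl`` / ``serial_pipeline_trex`` entries to ``serial_pipeline_reward_model_offpolicy`` with ``pretrain_reward_model=True`` and ``cooptrain_reward_model=False``. A minimal standalone sketch of that migrated workflow is given below for reference; it assumes an expert run such as ``lunarlander_dqn_seed0`` has already saved the checkpoints referenced by ``checkpoint_min``/``checkpoint_max``/``checkpoint_step``, that the config module exposes ``main_config``/``create_config`` as the other configs in this patch do, and that the ``--device`` flag (copied from the drex entry points here) is accepted.

# Minimal sketch of the migrated TREX workflow (mirrors the __main__ blocks added in this patch).
import argparse
import torch
from ding.entry import trex_collecting_data, serial_pipeline_reward_model_offpolicy
# Assumption: the config module exports main_config / create_config, as elsewhere in this patch.
from dizoo.box2d.lunarlander.config.lunarlander_trex_dqn_config import main_config, create_config

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # --cfg should point at the absolute path of the TREX config file being run.
    parser.add_argument('--cfg', type=str, default='please enter abs path for this file')
    parser.add_argument('--seed', type=int, default=0)
    # --device mirrors the drex entry points added in this patch; treating it as accepted here is an assumption.
    parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu')
    args = parser.parse_args()
    # Step 1: roll out the saved expert checkpoints and store ranked episodic data for TREX.
    trex_collecting_data(args)
    # Step 2: pretrain the TREX reward model once, then train the policy with it
    # (no reward-model co-training), matching the calls added throughout this patch.
    serial_pipeline_reward_model_offpolicy(
        (main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False
    )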
@@ -88,6 +85,7 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='dqn'), + reward_model=dict(type='trex'), ) lunarlander_trex_dqn_create_config = EasyDict(lunarlander_trex_dqn_create_config) create_config = lunarlander_trex_dqn_create_config @@ -99,7 +97,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -107,4 +105,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_trex([main_config, create_config]) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py b/dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py index 37e2c78fdd..a0e7ceb024 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py @@ -11,6 +11,7 @@ ), reward_model=dict( type='trex', + exp_name='lunarlander_trex_offppo_seed0', min_snippet_length=30, max_snippet_length=100, checkpoint_min=1000, @@ -22,14 +23,10 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. - expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /lunarlander.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. - # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + expert_model_path='lunarlander_offppo_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=8, + action_shape=4, ), policy=dict( cuda=True, @@ -76,7 +73,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -84,4 +81,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_trex([main_config, create_config]) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/classic_control/cartpole/config/__init__.py b/dizoo/classic_control/cartpole/config/__init__.py index 3d6d124274..0891fdb75d 100644 --- a/dizoo/classic_control/cartpole/config/__init__.py +++ b/dizoo/classic_control/cartpole/config/__init__.py @@ -4,7 +4,7 @@ from .cartpole_dqfd_config import cartpole_dqfd_config, cartpole_dqfd_create_config from .cartpole_dqn_config import cartpole_dqn_config, cartpole_dqn_create_config from .cartpole_dqn_gail_config import cartpole_dqn_gail_config, cartpole_dqn_gail_create_config -from .cartpole_gcl_config import cartpole_gcl_ppo_onpolicy_config, cartpole_gcl_ppo_onpolicy_create_config +from .cartpole_gcl_config import cartpole_gcl_ppo_offpolicy_config, cartpole_gcl_ppo_offpolicy_create_config from .cartpole_impala_config import cartpole_impala_config, cartpole_impala_create_config from .cartpole_iqn_config import cartpole_iqn_config, cartpole_iqn_create_config from .cartpole_ppo_offpolicy_config import cartpole_ppo_offpolicy_config, cartpole_ppo_offpolicy_create_config diff --git a/dizoo/classic_control/cartpole/config/cartpole_bc_config.py b/dizoo/classic_control/cartpole/config/cartpole_bc_config.py index 8315e934fe..b1975718f3 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_bc_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_bc_config.py @@ -20,7 +20,7 @@ batch_size=64, learning_rate=0.01, learner=dict(hook=dict(save_ckpt_after_iter=1000)), - train_epoch = 20, + train_epoch=20, ), eval=dict(evaluator=dict(eval_freq=40, )) ), diff --git a/dizoo/classic_control/cartpole/config/cartpole_dqn_config.py b/dizoo/classic_control/cartpole/config/cartpole_dqn_config.py index 3e5ca613d0..0e76e2a081 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_dqn_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_dqn_config.py @@ -24,6 +24,7 @@ update_per_collect=5, batch_size=64, learning_rate=0.001, + learner=dict(hook=dict(save_ckpt_after_iter=100)), ), collect=dict(n_sample=8), eval=dict(evaluator=dict(eval_freq=40, )), diff --git a/dizoo/classic_control/cartpole/config/cartpole_dqn_gail_config.py b/dizoo/classic_control/cartpole/config/cartpole_dqn_gail_config.py index b438e648e3..4300f417c8 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_dqn_gail_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_dqn_gail_config.py @@ -11,7 +11,7 @@ reward_model=dict( type='gail', input_size=5, - hidden_size=64, + hidden_size_list=[64], batch_size=64, learning_rate=1e-3, update_per_collect=100, @@ -20,7 +20,10 @@ # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # If collect_data is True, we will use this expert_model_path to collect expert data first, rather than we # will load data directly from user-defined data_path - expert_model_path='model_path_placeholder', + # data_path is the path to store expert policy data, which is used to train reward model + # so in general, data_path is the same as expert exp name + expert_model_path='cartpole_dqn_seed0/ckpt/ckpt_best.pth.tar', + data_path='cartpole_dqn_seed0', collect_count=1000, ), policy=dict( @@ -68,13 +71,22 @@ # or you can enter `ding -m serial_gail -c cartpole_dqn_gail_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g. 
cartpole_dqn_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.classic_control.cartpole.config import cartpole_dqn_config, cartpole_dqn_create_config + + # set expert config from policy config in dizoo + expert_cfg = (cartpole_dqn_config, cartpole_dqn_create_config) expert_main_config = cartpole_dqn_config - expert_create_config = cartpole_dqn_create_config - serial_pipeline_gail( - (main_config, create_config), (expert_main_config, expert_create_config), - max_env_step=1000000, + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, - collect_data=True + state_dict_path=main_config.reward_model.expert_model_path, + expert_data_path=expert_data_path, + collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/classic_control/cartpole/config/cartpole_drex_dqn_config.py b/dizoo/classic_control/cartpole/config/cartpole_drex_dqn_config.py index c898528a39..7c617aad85 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_drex_dqn_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_drex_dqn_config.py @@ -11,11 +11,12 @@ ), reward_model=dict( type='drex', + exp_name='cartpole_drex_dqn_seed0', min_snippet_length=5, max_snippet_length=100, checkpoint_min=0, - checkpoint_max=1000, - checkpoint_step=1000, + checkpoint_max=760, + checkpoint_step=760, learning_rate=1e-5, update_per_collect=1, # path to expert models that generate demonstration data @@ -23,25 +24,30 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name``. # For example, if you want to use dqn to generate demos, you can use ``spaceinvaders_dqn`` - expert_model_path='expert_model_path_placeholder', + expert_model_path='cartpole_dqn_seed0/ckpt/ckpt_best.pth.tar', # path to save reward model # Users should add their own model path here. # Absolute path is recommended. # For example, if you use ``spaceinvaders_drex``, then the reward model will be saved in this directory. - reward_model_path='reward_model_path_placeholder + ./spaceinvaders.params', + reward_model_path='cartpole_drex_dqn_seed0/cartpole.params', # path to save generated observations. # Users should add their own model path here. # Absolute path is recommended. # For example, if you use ``spaceinvaders_drex``, then all the generated data will be saved in this directory. - offline_data_path='offline_data_path_placeholder', + offline_data_path='cartpole_drex_dqn_seed0', # path to pretrained bc model. If omitted, bc will be trained instead. # Users should add their own model path here. Model path should lead to a model ckpt. # Absolute path is recommended. 
- bc_path='bc_path_placeholder', + # bc_path='bc_path_placeholder', # list of noises eps_list=[0, 0.5, 1], num_trajs_per_bin=20, + num_trajs=6, + num_snippets=6000, bc_iterations=6000, + hidden_size_list=[512, 64, 1], + obs_shape=4, + action_shape=2, ), policy=dict( cuda=False, @@ -57,7 +63,13 @@ batch_size=64, learning_rate=0.001, ), - collect=dict(n_sample=8, collector=dict(get_train_sample=False, )), + collect=dict( + n_sample=8, + collector=dict( + get_train_sample=False, + reward_shaping=False, + ), + ), eval=dict(evaluator=dict(eval_freq=40, )), other=dict( eps=dict( @@ -66,7 +78,7 @@ end=0.1, decay=10000, ), - replay_buffer=dict(replay_buffer_size=20000, ), + replay_buffer=dict(replay_buffer_size=200000, ), ), ), ) @@ -79,7 +91,24 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='dqn'), - collector=dict(type='episode'), ) cartpole_drex_dqn_create_config = EasyDict(cartpole_drex_dqn_create_config) create_config = cartpole_drex_dqn_create_config + +if __name__ == "__main__": + import argparse + import torch + from ding.config import read_config + from ding.entry import drex_collecting_data + from ding.entry import serial_pipeline_reward_model_offpolicy + parser = argparse.ArgumentParser() + parser.add_argument('--cfg', type=str, default='please enter abs path for this file') + parser.add_argument('--seed', type=int, default=0) + parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') + args = parser.parse_args() + args.cfg = read_config(args.cfg) + args.cfg[1].policy.type = 'bc' + args.cfg[0].policy.collect.n_episode = 8 + del args.cfg[0].policy.collect.n_sample + drex_collecting_data(args) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py b/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py index c4c8faf083..d71eaf3adb 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py @@ -1,7 +1,7 @@ from easydict import EasyDict -cartpole_gcl_ppo_onpolicy_config = dict( - exp_name='cartpole_gcl_seed0', +cartpole_gcl_ppo_offpolicy_config = dict( + exp_name='cartpole_gcl_offpolicy_seed0', env=dict( collector_env_num=8, evaluator_env_num=5, @@ -13,56 +13,75 @@ input_size=5, batch_size=32, continuous=False, + # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. + # Absolute path is recommended. + # In DI-engine, it is usually located in ``exp_name`` directory + # e.g. 'exp_name/expert_data.pkl' + expert_data_path='cartpole_ppo_offpolicy_seed0/expert_data.pkl', + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. 
+ expert_model_path='cartpole_ppo_offpolicy_seed0/ckpt/ckpt_best.pth.tar', update_per_collect=10, + collect_count=1000, ), policy=dict( cuda=False, - recompute_adv=True, - action_space='discrete', model=dict( obs_shape=4, action_shape=2, - action_space='discrete', encoder_hidden_size_list=[64, 64, 128], critic_head_hidden_size=128, actor_head_hidden_size=128, + action_space='discrete', ), learn=dict( - update_per_collect=2, + update_per_collect=6, batch_size=64, learning_rate=0.001, + value_weight=0.5, entropy_weight=0.01, + clip_ratio=0.2, + learner=dict(hook=dict(save_ckpt_after_iter=1000)), ), collect=dict( - # Users should add their own model path here. Model path should lead to a model. - # Absolute path is recommended. - # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. - model_path='model_path_placeholder', - # If you need the data collected by the collector to contain logit key which reflect the probability of - # the action, you can change the key to be True. - # In Guided cost Learning, we need to use logit to train the reward model, we change the key to be True. - collector_logit=True, # add logit into collected transition - n_sample=256, + n_sample=128, + unroll_len=1, discount_factor=0.9, gae_lambda=0.95, ), - eval=dict(evaluator=dict(eval_freq=50, ), ), + eval=dict(evaluator=dict(eval_freq=40, )), + other=dict(replay_buffer=dict(replay_buffer_size=5000)) ), ) -cartpole_gcl_ppo_onpolicy_config = EasyDict(cartpole_gcl_ppo_onpolicy_config) -main_config = cartpole_gcl_ppo_onpolicy_config -cartpole_gcl_ppo_onpolicy_create_config = dict( +cartpole_gcl_ppo_offpolicy_config = EasyDict(cartpole_gcl_ppo_offpolicy_config) +main_config = cartpole_gcl_ppo_offpolicy_config +cartpole_gcl_ppo_offpolicy_create_config = dict( env=dict( type='cartpole', import_names=['dizoo.classic_control.cartpole.envs.cartpole_env'], ), env_manager=dict(type='base'), - policy=dict(type='ppo'), + policy=dict(type='ppo_offpolicy'), reward_model=dict(type='guided_cost'), ) -cartpole_gcl_ppo_onpolicy_create_config = EasyDict(cartpole_gcl_ppo_onpolicy_create_config) -create_config = cartpole_gcl_ppo_onpolicy_create_config +cartpole_gcl_ppo_offpolicy_create_config = EasyDict(cartpole_gcl_ppo_offpolicy_create_config) +create_config = cartpole_gcl_ppo_offpolicy_create_config if __name__ == "__main__": - from ding.entry import serial_pipeline_guided_cost - serial_pipeline_guided_cost((main_config, create_config), seed=0) + # or you can enter `ding -m serial -c cartpole_ppo_offpolicy_config.py -s 0` + from ding.entry import collect_demo_data, serial_pipeline_reward_model_offpolicy + from dizoo.classic_control.cartpole.config.cartpole_ppo_offpolicy_config import cartpole_ppo_offpolicy_config, cartpole_ppo_offpolicy_create_config + + expert_cfg = (cartpole_ppo_offpolicy_config, cartpole_ppo_offpolicy_create_config) + expert_data_path = main_config.reward_model.expert_data_path + state_dict_path = main_config.reward_model.expert_model_path + collect_count = main_config.reward_model.collect_count + collect_demo_data( + expert_cfg, + seed=0, + state_dict_path=state_dict_path, + expert_data_path=expert_data_path, + collect_count=collect_count + ) + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/classic_control/cartpole/config/cartpole_ngu_config.py b/dizoo/classic_control/cartpole/config/cartpole_ngu_config.py index 3aecbbb01b..df004328f2 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_ngu_config.py +++ 
b/dizoo/classic_control/cartpole/config/cartpole_ngu_config.py @@ -12,44 +12,49 @@ obs_plus_prev_action_reward=True, # use specific env wrapper for ngu policy stop_value=195, ), - rnd_reward_model=dict( - intrinsic_reward_type='add', - learning_rate=5e-4, - obs_shape=4, - action_shape=2, - batch_size=128, # transitions - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='rnd-ngu', - ), - episodic_reward_model=dict( - # means if using rescale trick to the last non-zero reward - # when combing extrinsic and intrinsic reward. - # the rescale trick only used in: - # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal - # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander - # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, - # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the - # original last nonzero extrinsic reward. - # please refer to ngu_reward_model for details. - last_nonzero_reward_rescale=False, - # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True - # please refer to ngu_reward_model for details. - last_nonzero_reward_weight=1, - intrinsic_reward_type='add', - learning_rate=5e-4, - obs_shape=4, - action_shape=2, - batch_size=128, # transitions - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='episodic', + reward_model=dict( + type='ngu-reward', + policy_nstep=5, + collect_env_num=8, + rnd_reward_model=dict( + intrinsic_reward_type='add', + learning_rate=5e-4, + obs_shape=4, + action_shape=2, + batch_size=128, # transitions + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='rnd-ngu', + ), + episodic_reward_model=dict( + # means if using rescale trick to the last non-zero reward + # when combing extrinsic and intrinsic reward. + # the rescale trick only used in: + # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal + # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander + # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, + # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the + # original last nonzero extrinsic reward. + # please refer to ngu_reward_model for details. + last_nonzero_reward_rescale=False, + # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True + # please refer to ngu_reward_model for details. 
+ last_nonzero_reward_weight=1, + intrinsic_reward_type='add', + learning_rate=5e-4, + obs_shape=4, + action_shape=2, + batch_size=128, # transitions + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='episodic', + ), ), policy=dict( cuda=True, @@ -112,13 +117,12 @@ ), env_manager=dict(type='base'), policy=dict(type='ngu'), - rnd_reward_model=dict(type='rnd-ngu'), - episodic_reward_model=dict(type='episodic'), + reward_model=dict(type='ngu-reward'), ) cartpole_ngu_create_config = EasyDict(cartpole_ngu_create_config) create_config = cartpole_ngu_create_config if __name__ == "__main__": # or you can enter `ding -m serial_ngu -c cartpole_ngu_config.py -s 0` - from ding.entry import serial_pipeline_ngu - serial_pipeline_ngu([main_config, create_config], seed=0) + from ding.entry import serial_pipeline_reward_model_offpolicy + serial_pipeline_reward_model_offpolicy([main_config, create_config], seed=0) diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py index 306cadd6f2..df23981ecd 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py @@ -10,6 +10,7 @@ ), reward_model=dict( type='trex', + exp_name='cartpole_trex_dqn_seed0', min_snippet_length=5, max_snippet_length=100, checkpoint_min=0, @@ -19,7 +20,13 @@ update_per_collect=1, num_trajs=6, num_snippets=6000, + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name``. expert_model_path='cartpole_dqn_seed0', # expert model experiment directory path + hidden_size_list=[512, 64, 1], + obs_shape=4, + action_shape=2, ), policy=dict( cuda=False, @@ -58,6 +65,7 @@ ), env_manager=dict(type='base'), policy=dict(type='dqn'), + reward_model=dict(type='trex'), ) cartpole_trex_dqn_create_config = EasyDict(cartpole_trex_dqn_create_config) create_config = cartpole_trex_dqn_create_config @@ -66,10 +74,12 @@ # Users should first run ``cartpole_dqn_config.py`` to save models (or checkpoints). # Note: Users should check that the checkpoints generated should include iteration_'checkpoint_min'.pth.tar, iteration_'checkpoint_max'.pth.tar with the interval checkpoint_step # where checkpoint_max, checkpoint_min, checkpoint_step are specified above. + # example of running this file: + # python cartpole_trex_dqn_config.py --cfg cartpole_trex_dqn_config.py --seed 0 --device cpu import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_reward_model_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -77,4 +87,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_trex((main_config, create_config)) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py index b58535f900..b8d7c5887f 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py @@ -10,6 +10,7 @@ ), reward_model=dict( type='trex', + exp_name='cartpole_trex_offppo_seed0', min_snippet_length=5, max_snippet_length=100, checkpoint_min=0, @@ -17,9 +18,15 @@ checkpoint_step=100, learning_rate=1e-5, update_per_collect=1, - expert_model_path='abs model path', - reward_model_path='abs data path + ./cartpole.params', - data_path='abs data path', + num_trajs=0, + num_snippets=6000, + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name``. + expert_model_path='cartpole_ppo_seed0', # expert model experiment directory path + hidden_size_list=[512, 64, 1], + obs_shape=4, + action_shape=2, ), policy=dict( cuda=False, @@ -66,10 +73,12 @@ # Users should first run ``cartpole_offppo_config.py`` to save models (or checkpoints). # Note: Users should check that the checkpoints generated should include iteration_'checkpoint_min'.pth.tar, iteration_'checkpoint_max'.pth.tar with the interval checkpoint_step # where checkpoint_max, checkpoint_min, checkpoint_step are specified above. + # example: + # python cartpole_trex_offppo_config.py --cfg cartpole_trex_offppo_config.py --seed 0 --device cpu import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_reward_model_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -77,4 +86,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_reward_model_trex((main_config, create_config)) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py index 71b4d4a136..a1b44b5578 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py @@ -10,6 +10,7 @@ ), reward_model=dict( type='trex', + exp_name='cartpole_trex_onppo_seed0', min_snippet_length=5, max_snippet_length=100, checkpoint_min=0, @@ -17,9 +18,13 @@ checkpoint_step=100, learning_rate=1e-5, update_per_collect=1, - expert_model_path='abs model path', - reward_model_path='abs data path + ./cartpole.params', - data_path='abs data path', + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name``. 
+ expert_model_path='cartpole_ppo_seed0', # expert model experiment directory path + hidden_size_list=[512, 64, 1], + obs_shape=4, + action_shape=2, ), policy=dict( cuda=False, @@ -67,10 +72,12 @@ # Users should first run ``cartpole_onppo_config.py`` to save models (or checkpoints). # Note: Users should check that the checkpoints generated should include iteration_'checkpoint_min'.pth.tar, iteration_'checkpoint_max'.pth.tar with the interval checkpoint_step # where checkpoint_max, checkpoint_min, checkpoint_step are specified above. + # example of running this file: + # python cartpole_trex_onppo_config.py --cfg cartpole_trex_onppo_config.py --seed 0 --device cpu import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_reward_model_trex_onpolicy + from ding.entry import serial_pipeline_reward_model_onpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -78,4 +85,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_reward_model_trex_onpolicy((main_config, create_config)) + serial_pipeline_reward_model_onpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/minigrid/config/minigrid_ngu_config.py b/dizoo/minigrid/config/minigrid_ngu_config.py index c1aa47e1eb..e2b0dd62af 100644 --- a/dizoo/minigrid/config/minigrid_ngu_config.py +++ b/dizoo/minigrid/config/minigrid_ngu_config.py @@ -4,7 +4,7 @@ evaluator_env_num = 8 nstep = 5 minigrid_ppo_ngu_config = dict( - exp_name='minigrid_doorkey_ngu_seed0', + exp_name='minigrid_fourroom_ngu_seed0', env=dict( collector_env_num=collector_env_num, evaluator_env_num=evaluator_env_num, @@ -12,49 +12,54 @@ # typical MiniGrid env id: # {'MiniGrid-Empty-8x8-v0', 'MiniGrid-FourRooms-v0', 'MiniGrid-DoorKey-8x8-v0','MiniGrid-DoorKey-16x16-v0'}, # please refer to https://github.com/Farama-Foundation/MiniGrid for details. - env_id='MiniGrid-DoorKey-16x16-v0', + env_id='MiniGrid-FourRooms-v0', obs_plus_prev_action_reward=True, # use specific env wrapper for ngu policy max_step=300, stop_value=0.96, ), - rnd_reward_model=dict( - intrinsic_reward_type='add', - learning_rate=5e-4, - obs_shape=2835, - action_shape=7, - batch_size=320, # transitions - update_per_collect=10, # 32*100/320=10 - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='rnd-ngu', - ), - episodic_reward_model=dict( - # means if using rescale trick to the last non-zero reward - # when combing extrinsic and intrinsic reward. - # the rescale trick only used in: - # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal - # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander - # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, - # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the - # original last nonzero extrinsic reward. - # please refer to ngu_reward_model for details. - last_nonzero_reward_rescale=True, - # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True - # please refer to ngu_reward_model for details. 
- last_nonzero_reward_weight=100,
- intrinsic_reward_type='add',
- learning_rate=5e-4,
- obs_shape=2739,
- action_shape=7,
- batch_size=320, # transitions
- update_per_collect=10, # 32*100/64=50
- only_use_last_five_frames_for_icm_rnd=False,
- clear_buffer_per_iters=10,
- nstep=nstep,
- hidden_size_list=[128, 128, 64],
- type='episodic',
+ reward_model=dict(
+ type='ngu-reward',
+ policy_nstep=nstep,
+ collect_env_num=collector_env_num,
+ rnd_reward_model=dict(
+ intrinsic_reward_type='add',
+ learning_rate=5e-4,
+ obs_shape=2835,
+ action_shape=7,
+ batch_size=320, # transitions
+ update_per_collect=10, # 32*100/320=10
+ only_use_last_five_frames_for_icm_rnd=False,
+ clear_buffer_per_iters=10,
+ nstep=nstep,
+ hidden_size_list=[128, 128, 64],
+ type='rnd-ngu',
+ ),
+ episodic_reward_model=dict(
+ # means if using rescale trick to the last non-zero reward
+ # when combing extrinsic and intrinsic reward.
+ # the rescale trick only used in:
+ # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal
+ # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander
+ # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs,
+ # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the
+ # original last nonzero extrinsic reward.
+ # please refer to ngu_reward_model for details.
+ last_nonzero_reward_rescale=True,
+ # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True
+ # please refer to ngu_reward_model for details.
+ last_nonzero_reward_weight=100,
+ intrinsic_reward_type='add',
+ learning_rate=5e-4,
+ obs_shape=2835,
+ action_shape=7,
+ batch_size=320, # transitions
+ update_per_collect=10, # 32*100/64=50
+ only_use_last_five_frames_for_icm_rnd=False,
+ clear_buffer_per_iters=10,
+ nstep=nstep,
+ hidden_size_list=[128, 128, 64],
+ type='episodic',
+ ),
),
policy=dict(
cuda=True,
@@ -68,7 +73,7 @@
# i.e., <sequence sample length> = <unroll_len> = <burnin_step> + <learn_unroll_len>
learn_unroll_len=298, # set this key according to the episode length
model=dict(
- obs_shape=2739,
+ obs_shape=2835,
action_shape=7,
encoder_hidden_size_list=[128, 128, 512],
collector_env_num=collector_env_num,
@@ -86,7 +91,7 @@
# we want to collect data of length self._traj_len=INF
# unless the episode enters the 'done' state.
# In each collect phase, we collect a total of <n_sample> sequence samples.
- n_sample=32, + n_sample=64, traj_len_inf=True, env_num=collector_env_num, ), @@ -115,15 +120,14 @@ type='minigrid', import_names=['dizoo.minigrid.envs.minigrid_env'], ), - env_manager=dict(type='subprocess'), + env_manager=dict(type='base'), policy=dict(type='ngu'), - rnd_reward_model=dict(type='rnd-ngu'), - episodic_reward_model=dict(type='episodic'), + reward_model=dict(type='ngu-reward'), ) minigrid_ppo_ngu_create_config = EasyDict(minigrid_ppo_ngu_create_config) create_config = minigrid_ppo_ngu_create_config if __name__ == "__main__": # or you can enter `ding -m serial_ngu -c minigrid_ngu_config.py -s 0` - from ding.entry import serial_pipeline_ngu - serial_pipeline_ngu([main_config, create_config], seed=0) + from ding.entry import serial_pipeline_reward_model_offpolicy + serial_pipeline_reward_model_offpolicy([main_config, create_config], seed=0, max_env_step=int(1e7)) diff --git a/dizoo/minigrid/envs/minigrid_env.py b/dizoo/minigrid/envs/minigrid_env.py index e0bdbfbc07..b4b4aaecb0 100644 --- a/dizoo/minigrid/envs/minigrid_env.py +++ b/dizoo/minigrid/envs/minigrid_env.py @@ -10,8 +10,7 @@ from matplotlib import animation import matplotlib.pyplot as plt from minigrid.wrappers import FlatObsWrapper, RGBImgPartialObsWrapper, ImgObsWrapper -from .minigrid_wrapper import ViewSizeWrapper -from ding.envs import ObsPlusPrevActRewWrapper +from .minigrid_wrapper import ViewSizeWrapper, ObsPlusPrevActRewWrapper from ding.envs import BaseEnv, BaseEnvTimestep from ding.torch_utils import to_ndarray, to_list @@ -60,7 +59,12 @@ def reset(self) -> np.ndarray: self._env = ObsPlusPrevActRewWrapper(self._env) self._init_flag = True if self._flat_obs: - self._observation_space = gym.spaces.Box(0, 1, shape=(2835, ), dytpe=np.float32) + if type(self._env.observation_space) == gym.spaces.Dict: + obs_space = self._env.observation_space + obs_space['obs'] = gym.spaces.Box(0, 1, shape=(2835, ), dtype=np.float32) + self._observation_space = obs_space + else: + self._observation_space = gym.spaces.Box(0, 1, shape=(2835, ), dtype=np.float32) else: self._observation_space = self._env.observation_space # to be compatiable with subprocess env manager diff --git a/dizoo/minigrid/envs/minigrid_wrapper.py b/dizoo/minigrid/envs/minigrid_wrapper.py index 09a14c9c81..02af761f00 100644 --- a/dizoo/minigrid/envs/minigrid_wrapper.py +++ b/dizoo/minigrid/envs/minigrid_wrapper.py @@ -1,4 +1,5 @@ import gymnasium as gym +import numpy as np from gymnasium import spaces from gymnasium.core import ObservationWrapper @@ -32,3 +33,72 @@ def observation(self, obs): # print('vis_mask:' + vis_mask) image = grid.encode(vis_mask) return {**obs, "image": image} + + +class ObsPlusPrevActRewWrapper(gym.Wrapper): + """ + Overview: + This wrapper is used in policy NGU. + Set a dict {'obs': obs, 'prev_action': self.prev_action, 'prev_reward_extrinsic': self.prev_reward_extrinsic} + as the new wrapped observation, + which including the current obs, previous action and previous reward. + Interface: + ``__init__``, ``reset``, ``step`` + Properties: + - env (:obj:`gymnasium.Env`): the environment to wrap. + """ + + def __init__(self, env): + """ + Overview: + Initialize ``self.`` See ``help(type(self))`` for accurate signature; setup the properties. + Arguments: + - env (:obj:`gymnasium.Env`): the environment to wrap. 
+ """ + super().__init__(env) + self.observation_space = gym.spaces.Dict( + { + 'obs': env.observation_space, + 'prev_action': env.action_space, + 'prev_reward_extrinsic': gym.spaces.Box( + low=env.reward_range[0], high=env.reward_range[1], shape=(1, ), dtype=np.float32 + ) + } + ) + self.prev_action = -1 # null action + self.prev_reward_extrinsic = 0 # null reward + + def reset(self, *, seed: int = None): + """ + Overview: + Resets the state of the environment. + Returns: + - obs (:obj:`Dict`) : the wrapped observation, which including the current obs, \ + previous action and previous reward. + """ + obs, info = self.env.reset(seed=seed) + obs = {'obs': obs, 'prev_action': self.prev_action, 'prev_reward_extrinsic': self.prev_reward_extrinsic} + return obs, info + + def step(self, action): + """ + Overview: + Step the environment with the given action. + Save the previous action and reward to be used in next new obs + Arguments: + - action (:obj:`Any`): the given action to step with. + Returns: + - obs (:obj:`Dict`) : the wrapped observation, which including the current obs, \ + previous action and previous reward. + - reward (:obj:`Any`) : amount of reward returned after previous action + - done (:obj:`Bool`) : whether the episode has ended, in which case further \ + step() calls will return undefined results + - info (:obj:`Dict`) : contains auxiliary diagnostic information (helpful \ + for debugging, and sometimes learning) + """ + + obs, reward, done, truncated, info = self.env.step(action) + obs = {'obs': obs, 'prev_action': self.prev_action, 'prev_reward_extrinsic': self.prev_reward_extrinsic} + self.prev_action = action + self.prev_reward_extrinsic = reward + return obs, reward, done, truncated, info diff --git a/dizoo/minigrid/utils/eval.py b/dizoo/minigrid/utils/eval.py index e8e4f728fa..e3c6acb9fb 100644 --- a/dizoo/minigrid/utils/eval.py +++ b/dizoo/minigrid/utils/eval.py @@ -8,11 +8,11 @@ def eval( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - model: Optional[torch.nn.Module] = None, - state_dict: Optional[dict] = None, - replay_path: Optional[str] = './video', + input_cfg: Union[str, Tuple[dict, dict]], + seed: int = 0, + model: Optional[torch.nn.Module] = None, + state_dict: Optional[dict] = None, + replay_path: Optional[str] = './video', ) -> float: r""" Overview: diff --git a/dizoo/mujoco/config/ant_gail_sac_config.py b/dizoo/mujoco/config/ant_gail_sac_config.py index b7e7cd7d06..7b268a8667 100644 --- a/dizoo/mujoco/config/ant_gail_sac_config.py +++ b/dizoo/mujoco/config/ant_gail_sac_config.py @@ -15,7 +15,7 @@ ), reward_model=dict( input_size=obs_shape + act_shape, - hidden_size=256, + hidden_size_list=[256], batch_size=64, learning_rate=1e-3, update_per_collect=100, @@ -88,14 +88,17 @@ # or you can enter `ding -m serial_gail -c ant_gail_sac_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g. 
hopper_sac_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.mujoco.config.ant_sac_config import ant_sac_config, ant_sac_create_config - expert_main_config = ant_sac_config - expert_create_config = ant_sac_create_config - serial_pipeline_gail( - [main_config, create_config], [expert_main_config, expert_create_config], - max_env_step=10000000, - seed=0, - collect_data=True + # set your expert config here + expert_cfg = (ant_sac_config, ant_sac_create_config) + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, expert_data_path=expert_data_path, collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/mujoco/config/ant_trex_onppo_config.py b/dizoo/mujoco/config/ant_trex_onppo_config.py index f3d5e96b75..8bdf8bbb2a 100644 --- a/dizoo/mujoco/config/ant_trex_onppo_config.py +++ b/dizoo/mujoco/config/ant_trex_onppo_config.py @@ -75,6 +75,15 @@ create_config = ant_trex_ppo_create_config if __name__ == "__main__": - # or you can enter `ding -m serial -c ant_trex_onppo_config.py -s 0` - from ding.entry import serial_pipeline_trex_onpolicy - serial_pipeline_trex_onpolicy((main_config, create_config), seed=0) + import argparse + import torch + from ding.entry import trex_collecting_data + from ding.entry import serial_pipeline_reward_model_onpolicy + parser = argparse.ArgumentParser() + parser.add_argument('--cfg', type=str, default='please enter abs path for this file') + parser.add_argument('--seed', type=int, default=0) + parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') + args = parser.parse_args() + # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. + trex_collecting_data(args) + serial_pipeline_reward_model_onpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/mujoco/config/ant_trex_sac_config.py b/dizoo/mujoco/config/ant_trex_sac_config.py index 6c0ef73097..63aa6b6808 100644 --- a/dizoo/mujoco/config/ant_trex_sac_config.py +++ b/dizoo/mujoco/config/ant_trex_sac_config.py @@ -85,6 +85,15 @@ create_config = ant_trex_sac_create_config if __name__ == "__main__": - # or you can enter `ding -m serial -c ant_trex_sac_config.py -s 0` - from ding.entry import serial_pipeline_trex - serial_pipeline_trex((main_config, create_config), seed=0) + import argparse + import torch + from ding.entry import trex_collecting_data + from ding.entry import serial_pipeline_reward_model_offpolicy + parser = argparse.ArgumentParser() + parser.add_argument('--cfg', type=str, default='please enter abs path for this file') + parser.add_argument('--seed', type=int, default=0) + parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') + args = parser.parse_args() + # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
+ trex_collecting_data(args) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/mujoco/config/halfcheetah_bdq_config.py b/dizoo/mujoco/config/halfcheetah_bdq_config.py index 145bf8062e..25fb65ba35 100644 --- a/dizoo/mujoco/config/halfcheetah_bdq_config.py +++ b/dizoo/mujoco/config/halfcheetah_bdq_config.py @@ -22,7 +22,6 @@ action_bins_per_branch=2, # mean the action shape is 6, 2 discrete actions for each action dimension encoder_hidden_size_list=[256, 256, 128], ), - learn=dict( batch_size=512, learning_rate=3e-4, @@ -65,4 +64,8 @@ if __name__ == "__main__": # or you can enter `ding -m serial_onpolicy -c halfcheetah_onbdq_config.py -s 0` from ding.entry import serial_pipeline - serial_pipeline((main_config, create_config), seed=0, max_env_step=10000000,) \ No newline at end of file + serial_pipeline( + (main_config, create_config), + seed=0, + max_env_step=10000000, + ) diff --git a/dizoo/mujoco/config/halfcheetah_gail_sac_config.py b/dizoo/mujoco/config/halfcheetah_gail_sac_config.py index bf64cd8c64..ff62473289 100644 --- a/dizoo/mujoco/config/halfcheetah_gail_sac_config.py +++ b/dizoo/mujoco/config/halfcheetah_gail_sac_config.py @@ -15,7 +15,7 @@ ), reward_model=dict( input_size=obs_shape + act_shape, - hidden_size=256, + hidden_size_list=[256], batch_size=64, learning_rate=1e-3, update_per_collect=100, @@ -87,14 +87,17 @@ # or you can enter `ding -m serial_gail -c ant_gail_sac_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g. hopper_sac_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.mujoco.config.halfcheetah_sac_config import halfcheetah_sac_config, halfcheetah_sac_create_config - expert_main_config = halfcheetah_sac_config - expert_create_config = halfcheetah_sac_create_config - serial_pipeline_gail( - [main_config, create_config], [expert_main_config, expert_create_config], - max_env_step=10000000, - seed=0, - collect_data=True + # set your expert config here + expert_cfg = (halfcheetah_sac_config, halfcheetah_sac_create_config) + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, expert_data_path=expert_data_path, collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/mujoco/config/halfcheetah_gcl_sac_config.py b/dizoo/mujoco/config/halfcheetah_gcl_sac_config.py index 367b7bcf03..05781959c0 100644 --- a/dizoo/mujoco/config/halfcheetah_gcl_sac_config.py +++ b/dizoo/mujoco/config/halfcheetah_gcl_sac_config.py @@ -17,7 +17,17 @@ batch_size=32, action_shape=6, continuous=True, + # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. + # Absolute path is recommended. + # In DI-engine, it is usually located in ``exp_name`` directory + # e.g. 'exp_name/expert_data.pkl' + expert_data_path='halfcheetah_sac_seed0/expert_data.pkl', + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. 
+ expert_model_path='halfcheetah_sac_seed0/ckpt/ckpt_best.pth.tar', update_per_collect=20, + collect_count=300000, ), policy=dict( cuda=False, @@ -45,13 +55,6 @@ auto_alpha=False, ), collect=dict( - # Users should add their own model path here. Model path should lead to a model. - # Absolute path is recommended. - # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. - model_path='model_path_placeholder', - # If you need the data collected by the collector to contain logit key which reflect the probability of - # the action, you can change the key to be True. - # In Guided cost Learning, we need to use logit to train the reward model, we change the key to be True. collector_logit=True, n_sample=256, unroll_len=1, @@ -82,5 +85,18 @@ create_config = halfcheetah_gcl_sac_create_config if __name__ == '__main__': - from ding.entry import serial_pipeline_guided_cost - serial_pipeline_guided_cost((main_config, create_config), seed=0) + from ding.entry import collect_demo_data, serial_pipeline_reward_model_offpolicy + from dizoo.mujoco.config.halfcheetah_sac_config import halfcheetah_sac_config, halfcheetah_sac_create_config + + expert_cfg = (halfcheetah_sac_config, halfcheetah_sac_create_config) + expert_data_path = main_config.reward_model.expert_data_path + state_dict_path = main_config.reward_model.expert_model_path + collect_count = main_config.reward_model.collect_count + collect_demo_data( + expert_cfg, + seed=0, + state_dict_path=state_dict_path, + expert_data_path=expert_data_path, + collect_count=collect_count + ) + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/mujoco/config/halfcheetah_trex_onppo_config.py b/dizoo/mujoco/config/halfcheetah_trex_onppo_config.py index 6d635c212d..867b4381e8 100644 --- a/dizoo/mujoco/config/halfcheetah_trex_onppo_config.py +++ b/dizoo/mujoco/config/halfcheetah_trex_onppo_config.py @@ -12,6 +12,8 @@ stop_value=3000, ), reward_model=dict( + type='trex', + exp_name='halfcheetah_trex_onppo_seed0', min_snippet_length=30, max_snippet_length=100, checkpoint_min=10000, @@ -24,14 +26,10 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. - expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /HalfCheetah.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. - # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + expert_model_path='halfcheetah_onppo_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=17, + action_shape=6, ), policy=dict( cuda=True, @@ -90,7 +88,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex_onpolicy + from ding.entry import serial_pipeline_reward_model_onpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -98,4 +96,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args)
- serial_pipeline_trex_onpolicy([main_config, create_config])
+ serial_pipeline_reward_model_onpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False)
diff --git a/dizoo/mujoco/config/halfcheetah_trex_sac_config.py b/dizoo/mujoco/config/halfcheetah_trex_sac_config.py
index 5f123682a0..ecacdcf0d4 100644
--- a/dizoo/mujoco/config/halfcheetah_trex_sac_config.py
+++ b/dizoo/mujoco/config/halfcheetah_trex_sac_config.py
@@ -12,6 +12,8 @@
stop_value=12000,
),
reward_model=dict(
+ type='trex',
+ exp_name='halfcheetah_trex_sac_seed0',
learning_rate=1e-5,
min_snippet_length=30,
max_snippet_length=100,
@@ -23,14 +25,10 @@
# Absolute path is recommended.
# In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``.
# However, here in ``expert_model_path``, it is ``exp_name`` of the expert config.
- expert_model_path='model_path_placeholder',
- # Path where to store the reward model
- reward_model_path='data_path_placeholder + /HalfCheetah.params',
- # Users should add their own data path here. Data path should lead to a file to store data or load the stored data.
- # Absolute path is recommended.
- # In DI-engine, it is usually located in ``exp_name`` directory
- # See ding/entry/application_entry_trex_collect_data.py to collect the data
- data_path='data_path_placeholder',
+ expert_model_path='halfcheetah_sac_seed0',
+ hidden_size_list=[512, 64, 1],
+ obs_shape=17,
+ action_shape=6,
),
policy=dict(
cuda=True,
@@ -92,7 +90,7 @@
import argparse
import torch
from ding.entry import trex_collecting_data
- from ding.entry import serial_pipeline_trex
+ from ding.entry import serial_pipeline_reward_model_offpolicy
parser = argparse.ArgumentParser()
parser.add_argument('--cfg', type=str, default='please enter abs path for this file')
parser.add_argument('--seed', type=int, default=0)
@@ -100,4 +98,4 @@
args = parser.parse_args()
# The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex.
trex_collecting_data(args)
- serial_pipeline_trex([main_config, create_config])
+ serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False)
diff --git a/dizoo/mujoco/config/hopper_bdq_config.py b/dizoo/mujoco/config/hopper_bdq_config.py
index de08da2a7a..34dbe21664 100644
--- a/dizoo/mujoco/config/hopper_bdq_config.py
+++ b/dizoo/mujoco/config/hopper_bdq_config.py
@@ -68,4 +68,8 @@
if __name__ == "__main__":
# or you can enter `ding -m serial_onpolicy -c hopper_bdq_config.py -s 0`
from ding.entry import serial_pipeline
- serial_pipeline([main_config, create_config], seed=0, max_env_step=10000000,)
+ serial_pipeline(
+ [main_config, create_config],
+ seed=0,
+ max_env_step=10000000,
+ )
diff --git a/dizoo/mujoco/config/hopper_gail_sac_config.py b/dizoo/mujoco/config/hopper_gail_sac_config.py
index 26ef8b3816..f0fcdd4515 100644
--- a/dizoo/mujoco/config/hopper_gail_sac_config.py
+++ b/dizoo/mujoco/config/hopper_gail_sac_config.py
@@ -15,20 +15,20 @@
),
reward_model=dict(
input_size=obs_shape + act_shape,
- hidden_size=256,
+ hidden_size_list=[256],
batch_size=64,
learning_rate=1e-3,
update_per_collect=100,
# Users should add their own model path here. Model path should lead to a model.
# Absolute path is recommended.
# In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``.
- expert_model_path='model_path_placeholder', + expert_model_path='hopper_sac_seed0/ckpt/ckpt_best.pth.tar', # Path where to store the reward model - reward_model_path='data_path_placeholder+/reward_model/ckpt/ckpt_best.pth.tar', + reward_model_path='hopper_gail_sac_seed0/reward_model/ckpt/ckpt_best.pth.tar', # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. # Absolute path is recommended. # In DI-engine, it is usually located in ``exp_name`` directory - data_path='data_path_placeholder', + data_path='hopper_sac_seed0', collect_count=100000, ), policy=dict( @@ -88,13 +88,21 @@ # or you can enter `ding -m serial_gail -c hopper_gail_sac_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g. hopper_sac_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.mujoco.config.hopper_sac_config import hopper_sac_config, hopper_sac_create_config + # set expert config from policy config in dizoo + expert_cfg = (hopper_sac_config, hopper_sac_create_config) expert_main_config = hopper_sac_config - expert_create_config = hopper_sac_create_config - serial_pipeline_gail( - [main_config, create_config], [expert_main_config, expert_create_config], - max_env_step=1000000, + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, - collect_data=True + state_dict_path=main_config.reward_model.expert_model_path, + expert_data_path=expert_data_path, + collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/mujoco/config/hopper_gcl_config.py b/dizoo/mujoco/config/hopper_gcl_config.py index 214f44dbf7..d7299367c1 100644 --- a/dizoo/mujoco/config/hopper_gcl_config.py +++ b/dizoo/mujoco/config/hopper_gcl_config.py @@ -17,7 +17,17 @@ batch_size=32, action_shape=3, continuous=True, + # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. + # Absolute path is recommended. + # In DI-engine, it is usually located in ``exp_name`` directory + # e.g. 'exp_name/expert_data.pkl' + expert_data_path='hopper_sac_seed0/expert_data.pkl', + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. + expert_model_path='hopper_sac_seed0/ckpt/ckpt_best.pth.tar', update_per_collect=20, + collect_count=100000, ), policy=dict( cuda=False, @@ -38,13 +48,6 @@ adv_norm=True, ), collect=dict( - # Users should add their own model path here. Model path should lead to a model. - # Absolute path is recommended. - # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. - model_path='model_path_placeholder', - # If you need the data collected by the collector to contain logit key which reflect the probability of - # the action, you can change the key to be True. - # In Guided cost Learning, we need to use logit to train the reward model, we change the key to be True. 
collector_logit=True, n_sample=2048, unroll_len=1, @@ -70,5 +73,18 @@ create_config = hopper_gcl_create_config if __name__ == '__main__': - from ding.entry import serial_pipeline_guided_cost - serial_pipeline_guided_cost((main_config, create_config), seed=0) + from ding.entry import collect_demo_data, serial_pipeline_reward_model_offpolicy + from dizoo.mujoco.config.hopper_sac_config import hopper_sac_config, hopper_sac_create_config + + expert_cfg = (hopper_sac_config, hopper_sac_create_config) + expert_data_path = main_config.reward_model.expert_data_path + state_dict_path = main_config.reward_model.expert_model_path + collect_count = main_config.reward_model.collect_count + collect_demo_data( + expert_cfg, + seed=0, + state_dict_path=state_dict_path, + expert_data_path=expert_data_path, + collect_count=collect_count + ) + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/mujoco/config/hopper_trex_onppo_config.py b/dizoo/mujoco/config/hopper_trex_onppo_config.py index e69451fe3c..5e375d668c 100644 --- a/dizoo/mujoco/config/hopper_trex_onppo_config.py +++ b/dizoo/mujoco/config/hopper_trex_onppo_config.py @@ -12,6 +12,8 @@ stop_value=3000, ), reward_model=dict( + type='trex', + exp_name='hopper_trex_onppo_seed0', min_snippet_length=30, max_snippet_length=100, checkpoint_min=10000, @@ -24,14 +26,10 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. - expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /Hopper.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. - # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + expert_model_path='hopper_onppo_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=11, + action_shape=3, ), policy=dict( cuda=True, @@ -71,6 +69,7 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='ppo', ), + reward_model=dict(type='trex', ), ) hopper_trex_onppo_create_config = EasyDict(hopper_trex_onppo_create_config) create_config = hopper_trex_onppo_create_config @@ -82,7 +81,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex_onpolicy + from ding.entry import serial_pipeline_reward_model_onpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -90,4 +89,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_trex_onpolicy([main_config, create_config]) + serial_pipeline_reward_model_onpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/mujoco/config/hopper_trex_sac_config.py b/dizoo/mujoco/config/hopper_trex_sac_config.py index 5c4aa6f2c1..8dece97ca0 100644 --- a/dizoo/mujoco/config/hopper_trex_sac_config.py +++ b/dizoo/mujoco/config/hopper_trex_sac_config.py @@ -12,6 +12,8 @@ stop_value=6000, ), reward_model=dict( + type='trex', + exp_name='hopper_trex_sac_seed0', learning_rate=1e-5, min_snippet_length=30, max_snippet_length=100, @@ -23,14 +25,10 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. - expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /Hopper.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. - # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + expert_model_path='hopper_sac_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=11, + action_shape=3, ), policy=dict( cuda=True, @@ -80,6 +78,7 @@ import_names=['ding.policy.sac'], ), replay_buffer=dict(type='naive', ), + reward_model=dict(type='trex', ), ) hopper_trex_sac_create_config = EasyDict(hopper_trex_sac_create_config) create_config = hopper_trex_sac_create_config @@ -91,7 +90,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -99,4 +98,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_trex([main_config, create_config]) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/mujoco/config/walker2d_gail_ddpg_config.py b/dizoo/mujoco/config/walker2d_gail_ddpg_config.py index 779f65f63b..d4370349dc 100644 --- a/dizoo/mujoco/config/walker2d_gail_ddpg_config.py +++ b/dizoo/mujoco/config/walker2d_gail_ddpg_config.py @@ -87,13 +87,17 @@ # or you can enter `ding -m serial_gail -c walker2d_gail_ddpg_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g. 
walker2d_ddpg_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.mujoco.config.walker2d_ddpg_config import walker2d_ddpg_config, walker2d_ddpg_create_config - expert_main_config = walker2d_ddpg_config - expert_create_config = walker2d_ddpg_create_config - serial_pipeline_gail( - [main_config, create_config], [expert_main_config, expert_create_config], - max_env_step=1000000, - seed=0, - collect_data=True + + # set your expert config here + expert_cfg = (walker2d_ddpg_config, walker2d_ddpg_create_config) + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, expert_data_path=expert_data_path, collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy((main_config, create_config)) \ No newline at end of file diff --git a/dizoo/mujoco/config/walker2d_gail_sac_config.py b/dizoo/mujoco/config/walker2d_gail_sac_config.py index 7bd2de9022..a075c32299 100644 --- a/dizoo/mujoco/config/walker2d_gail_sac_config.py +++ b/dizoo/mujoco/config/walker2d_gail_sac_config.py @@ -88,14 +88,17 @@ # or you can enter `ding -m serial_gail -c ant_gail_sac_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g. hopper_sac_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.mujoco.config.walker2d_sac_config import walker2d_sac_config, walker2d_sac_create_config - expert_main_config = walker2d_sac_config - expert_create_config = walker2d_sac_create_config - serial_pipeline_gail( - [main_config, create_config], [expert_main_config, expert_create_config], - max_env_step=5000000, - seed=0, - collect_data=True + # set your expert config here + expert_cfg = (walker2d_sac_config, walker2d_sac_create_config) + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, expert_data_path=expert_data_path, collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/mujoco/config/walker2d_gcl_config.py b/dizoo/mujoco/config/walker2d_gcl_config.py index 1b0b56fa32..b741359067 100644 --- a/dizoo/mujoco/config/walker2d_gcl_config.py +++ b/dizoo/mujoco/config/walker2d_gcl_config.py @@ -17,7 +17,17 @@ batch_size=32, action_shape=6, continuous=True, + # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. + # Absolute path is recommended. + # In DI-engine, it is usually located in ``exp_name`` directory + # e.g. 'exp_name/expert_data.pkl' + expert_data_path='walker2d_sac_seed0/expert_data.pkl', + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. + expert_model_path='walker2d_sac_seed0/ckpt/ckpt_best.pth.tar', update_per_collect=20, + collect_count=100000, ), policy=dict( cuda=False, @@ -38,13 +48,6 @@ adv_norm=True, ), collect=dict( - # Users should add their own model path here. Model path should lead to a model. - # Absolute path is recommended. - # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. 
- model_path='model_path_placeholder', - # If you need the data collected by the collector to contain logit key which reflect the probability of - # the action, you can change the key to be True. - # In Guided cost Learning, we need to use logit to train the reward model, we change the key to be True. collector_logit=True, n_sample=2048, unroll_len=1, @@ -71,5 +74,18 @@ create_config = walker2d_gcl_create_config if __name__ == '__main__': - from ding.entry import serial_pipeline_guided_cost - serial_pipeline_guided_cost((main_config, create_config), seed=0) + from ding.entry import collect_demo_data, serial_pipeline_reward_model_offpolicy + from dizoo.mujoco.config.walker2d_sac_config import walker2d_sac_config, walker2d_sac_create_config + + expert_cfg = (walker2d_sac_config, walker2d_sac_create_config) + expert_data_path = main_config.reward_model.expert_data_path + state_dict_path = main_config.reward_model.expert_model_path + collect_count = main_config.reward_model.collect_count + collect_demo_data( + expert_cfg, + seed=0, + state_dict_path=state_dict_path, + expert_data_path=expert_data_path, + collect_count=collect_count + ) + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/mujoco/config/walker2d_trex_onppo_config.py b/dizoo/mujoco/config/walker2d_trex_onppo_config.py index c53c1efb4b..bde35d48f3 100644 --- a/dizoo/mujoco/config/walker2d_trex_onppo_config.py +++ b/dizoo/mujoco/config/walker2d_trex_onppo_config.py @@ -12,6 +12,8 @@ stop_value=3000, ), reward_model=dict( + type='trex', + exp_name='walker2d_trex_onppo_seed0', min_snippet_length=30, max_snippet_length=100, checkpoint_min=10000, @@ -24,14 +26,10 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. - expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /Walker2d.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. - # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + expert_model_path='walker2d_onppo_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=17, + action_shape=6, ), policy=dict( cuda=True, @@ -82,7 +80,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex_onpolicy + from ding.entry import serial_pipeline_reward_model_onpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -90,4 +88,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_trex_onpolicy([main_config, create_config]) + serial_pipeline_reward_model_onpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/mujoco/config/walker2d_trex_sac_config.py b/dizoo/mujoco/config/walker2d_trex_sac_config.py index fdd1cab65e..998f6dcbeb 100644 --- a/dizoo/mujoco/config/walker2d_trex_sac_config.py +++ b/dizoo/mujoco/config/walker2d_trex_sac_config.py @@ -12,6 +12,8 @@ stop_value=6000, ), reward_model=dict( + type='trex', + exp_name='walker2d_trex_sac_seed0', learning_rate=1e-5, min_snippet_length=30, max_snippet_length=100, @@ -23,14 +25,10 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. - expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /Walker2d.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. - # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + expert_model_path='walker2d_sac_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=17, + action_shape=6, ), policy=dict( cuda=True, @@ -91,7 +89,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -99,4 +97,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_trex([main_config, create_config]) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False)
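
The config changes above all converge on the same pattern: expert data is produced by ``collect_demo_data`` (GAIL/GCL) or ``trex_collecting_data`` (TREX), and the removed special-purpose pipelines are replaced by the generic ``serial_pipeline_reward_model_offpolicy`` / ``serial_pipeline_reward_model_onpolicy``. The sketch below condenses that pattern into one standalone script, using the hopper GAIL/SAC pair as the example. It is a minimal illustration that only reuses calls and keyword arguments already visible in the updated configs; the module-level names ``main_config`` / ``create_config`` imported from ``hopper_gail_sac_config`` and the ``hopper_sac_seed0`` paths are taken from those configs and stand in for whatever experiment is being ported.

# Minimal sketch of the unified reward-model entry points, assuming DI-engine and
# MuJoCo are installed and the expert checkpoint path set in the config exists.
from ding.entry import collect_demo_data, serial_pipeline_reward_model_offpolicy
from dizoo.mujoco.config.hopper_sac_config import hopper_sac_config, hopper_sac_create_config
from dizoo.mujoco.config.hopper_gail_sac_config import main_config, create_config

if __name__ == '__main__':
    # Step 1: roll out the expert policy and dump transitions to disk,
    # mirroring the GAIL/GCL __main__ blocks above.
    collect_demo_data(
        (hopper_sac_config, hopper_sac_create_config),
        seed=0,
        state_dict_path=main_config.reward_model.expert_model_path,
        expert_data_path=main_config.reward_model.data_path + '/expert_data.pkl',
        collect_count=main_config.reward_model.collect_count,
    )
    # Step 2: co-train the reward model and the policy with the generic off-policy pipeline.
    serial_pipeline_reward_model_offpolicy((main_config, create_config), seed=0)
    # For TREX-style configs, the reward model is instead pretrained once on the collected
    # episodes and then left fixed during policy training, as in the __main__ blocks above:
    # serial_pipeline_reward_model_offpolicy(
    #     (main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False
    # )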