fix(lisong): fix icm/rnd+onppo config bugs and app_key env bugs #564

Merged
25 commits merged on Mar 5, 2023. The changes shown below are from 7 of the 25 commits.

Commits (25)
0936556  polish(pu): polish icm_onppo_config (puyuan1996, Nov 21, 2022)
c03fadd  polish(pu): polish icm rnd intrinsic_reward_weight and config (puyuan1996, Nov 29, 2022)
c66a6df  Merge branch 'main' of https://github.com/opendilab/DI-engine into de… (puyuan1996, Nov 29, 2022)
39365e1  style(pu): yapf format (puyuan1996, Nov 29, 2022)
6a92682  Merge branch 'main' into dev-icm-onppo (puyuan1996, Dec 4, 2022)
6b11aff  fix(lisong): fix config bugs and app_key env bugs (song2181, Dec 28, 2022)
3907dd9  Merge https://github.com/opendilab/DI-engine into dev-icm-onppo (song2181, Dec 28, 2022)
e9db602  Merge branch 'main' into dev-icm-onppo (PaParaZz1, Jan 2, 2023)
0bf6f14  Merge branch 'main' into dev-icm-onppo (PaParaZz1, Jan 2, 2023)
8d58e4c  Merge branch 'main' into dev-icm-onppo (puyuan1996, Jan 9, 2023)
142fc44  polish(lisong): polish icm/rnd config and reward model (song2181, Jan 10, 2023)
e2ae39c  Merge branch 'dev-icm-onppo' of github.com:song2181/DI-engine into de… (song2181, Jan 10, 2023)
3731e02  fix(lisong): add viewsizerapper in minigrid_wrapper (song2181, Jan 11, 2023)
c3a4710  Merge branch 'main' into dev-icm-onppo (puyuan1996, Jan 30, 2023)
ab6a2a2  Merge branch 'main' into dev-icm-onppo (puyuan1996, Feb 7, 2023)
d17f6b3  fix(lisong): add doorkey8x8 rnd+onppo config,save reward model, fix r… (song2181, Feb 9, 2023)
84a19c4  Merge branch 'main' of https://github.com/opendilab/DI-engine into de… (song2181, Feb 9, 2023)
9906d1b  Merge branch 'dev-icm-onppo' of github.com:song2181/DI-engine into de… (song2181, Feb 9, 2023)
fb37768  Merge branch 'main' into dev-icm-onppo (puyuan1996, Feb 13, 2023)
d8191c2  fix(pu): fix augmented_reward tb_logging (puyuan1996, Feb 13, 2023)
be4fdc5  Merge branch 'dev-icm-onppo' of https://github.com/song2181/DI-engine… (puyuan1996, Feb 13, 2023)
153b8db  feat(lisong): add noisy-tv env in minigrid (song2181, Feb 20, 2023)
0fc5f93  Merge branch 'dev-icm-onppo' of https://github.com/puyuan1996/DI-engi… (song2181, Feb 20, 2023)
1957316  Merge branch 'dev-icm-onppo' of github.com:song2181/DI-engine into de… (song2181, Feb 20, 2023)
f6985c5  fix(lisong): modify noisy_tv env (song2181, Feb 23, 2023)
5 changes: 4 additions & 1 deletion ding/entry/serial_entry_reward_model_offpolicy.py
@@ -87,6 +87,7 @@ def serial_pipeline_reward_model_offpolicy(
# Accumulate plenty of data at the beginning of training.
if cfg.policy.get('random_collect_size', 0) > 0:
random_collect(cfg.policy, policy, collector, collector_env, commander, replay_buffer)
count = 0
while True:
collect_kwargs = commander.step()
# Evaluate policy performance
@@ -103,7 +104,8 @@ def serial_pipeline_reward_model_offpolicy(
replay_buffer.push(new_data, cur_collector_envstep=collector.envstep)
# update reward_model
reward_model.train()
reward_model.clear_data()
if count % cfg.reward_model.clear_buffer_per_iters == 0:
reward_model.clear_data()
# Learn policy from collected data
for i in range(cfg.policy.learn.update_per_collect):
# Learner will train ``update_per_collect`` times in one iteration.
@@ -122,6 +124,7 @@
replay_buffer.update(learner.priority_info)
if collector.envstep >= max_env_step or learner.train_iter >= max_train_iter:
break
count += 1

# Learner's after_run hook.
learner.call_hook('after_run')
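
For readers skimming this diff: the new `count` / `clear_buffer_per_iters` logic keeps the reward model's training data around for several collect iterations before wiping it, instead of clearing after every collect. A minimal sketch of the pattern, with a toy stand-in for the real reward model (the class, loop bound, and data are illustrative, not the actual DI-engine pipeline):

```python
# Sketch of the periodic-clear pattern added above; `ToyRewardModel` and the
# fixed loop bound are stand-ins, not DI-engine classes.
class ToyRewardModel:
    def __init__(self):
        self.data = []

    def collect_data(self, batch):
        self.data.extend(batch)

    def train(self):
        pass  # fit the intrinsic-reward predictor on self.data

    def clear_data(self):
        self.data.clear()


reward_model = ToyRewardModel()
clear_buffer_per_iters = 1000

count = 0
while count < 5000:  # stand-in for the envstep / train_iter stop condition
    reward_model.collect_data([count])  # stand-in for newly collected transitions
    reward_model.train()
    # Clear the reward-model buffer only every `clear_buffer_per_iters` iterations.
    if count % clear_buffer_per_iters == 0:
        reward_model.clear_data()
    count += 1
```
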
23 changes: 14 additions & 9 deletions ding/reward_model/icm_reward_model.py
@@ -195,7 +195,6 @@ def _train(self) -> None:
def train(self) -> None:
for _ in range(self.cfg.update_per_collect):
self._train()
self.clear_data()

def estimate(self, data: list) -> List[Dict]:
# NOTE: deepcopy reward part of data is very important,
@@ -207,17 +206,23 @@ def estimate(self, data: list) -> List[Dict]:
actions = torch.cat(actions).to(self.device)
with torch.no_grad():
real_next_state_feature, pred_next_state_feature, _ = self.reward_model(states, next_states, actions)
reward = self.forward_mse(real_next_state_feature, pred_next_state_feature).mean(dim=1)
reward = (reward - reward.min()) / (reward.max() - reward.min() + 1e-8)
reward = reward.to(train_data_augmented[0]['reward'].device)
reward = torch.chunk(reward, reward.shape[0], dim=0)
for item, rew in zip(train_data_augmented, reward):
raw_icm_reward = self.forward_mse(real_next_state_feature, pred_next_state_feature).mean(dim=1)
icm_reward = (raw_icm_reward - raw_icm_reward.min()) / (raw_icm_reward.max() - raw_icm_reward.min() + 1e-8)
Review comment (Member): why norm twice here

icm_reward = icm_reward.to(train_data_augmented[0]['reward'].device)
icm_reward = torch.chunk(icm_reward, icm_reward.shape[0], dim=0)
for item, icm_rew in zip(train_data_augmented, icm_reward):
if self.intrinsic_reward_type == 'add':
item['reward'] += rew
if self.cfg.extrinsic_reward_norm:
item['reward'] = item[
'reward'] / self.cfg.extrinsic_reward_norm_max + icm_rew * self.cfg.intrinsic_reward_weight
else:
item['reward'] = item['reward'] + icm_rew * self.cfg.intrinsic_reward_weight
elif self.intrinsic_reward_type == 'new':
item['intrinsic_reward'] = rew
item['intrinsic_reward'] = icm_rew
if self.cfg.extrinsic_reward_norm:
item['reward'] = item['reward'] / self.cfg.extrinsic_reward_norm_max
elif self.intrinsic_reward_type == 'assign':
item['reward'] = rew
item['reward'] = icm_rew

return train_data_augmented
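
To summarize the `estimate()` logic above in one place: the ICM prediction error is min-max normalized to [0, 1], the extrinsic reward is optionally divided by `extrinsic_reward_norm_max`, and the two are mixed with `intrinsic_reward_weight`. A standalone sketch of the 'add' branch (a hypothetical helper, not the DI-engine API):

```python
import torch


def fuse_icm_reward(extrinsic: torch.Tensor,
                    pred_error: torch.Tensor,
                    intrinsic_reward_weight: float = 0.003,
                    extrinsic_reward_norm: bool = True,
                    extrinsic_reward_norm_max: float = 1.0) -> torch.Tensor:
    """Hypothetical helper mirroring the 'add' branch of ICM estimate()."""
    # Min-max normalize the per-transition prediction error to [0, 1].
    icm_reward = (pred_error - pred_error.min()) / (pred_error.max() - pred_error.min() + 1e-8)
    if extrinsic_reward_norm:
        return extrinsic / extrinsic_reward_norm_max + intrinsic_reward_weight * icm_reward
    return extrinsic + intrinsic_reward_weight * icm_reward


# Toy example: sparse MiniGrid-style rewards plus a small intrinsic bonus.
ext = torch.tensor([0.0, 0.0, 1.0])
err = torch.tensor([0.2, 0.8, 0.1])
print(fuse_icm_reward(ext, err))  # roughly tensor([0.0004, 0.0030, 1.0000])
```
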

35 changes: 9 additions & 26 deletions ding/reward_model/rnd_reward_model.py
@@ -12,7 +12,6 @@
from .base_reward_model import BaseRewardModel
from ding.utils import RunningMeanStd
from ding.torch_utils.data_helper import to_tensor
import copy


def collect_states(iterator):
@@ -60,19 +59,13 @@ class RndRewardModel(BaseRewardModel):
obs_norm=True,
obs_norm_clamp_min=-1,
obs_norm_clamp_max=1,
intrinsic_reward_weight=None,
# means the relative weight of RND intrinsic_reward.
# If intrinsic_reward_weight=None, we will automatically set it based on
# the absolute value of the difference between max and min extrinsic reward in the sampled mini-batch
# please refer to estimate() method for details.
intrinsic_reward_rescale=0.01,
# means the rescale value of RND intrinsic_reward only used when intrinsic_reward_weight is None
intrinsic_reward_weight=0.01,
)

def __init__(self, config: EasyDict, device: str = 'cpu', tb_logger: 'SummaryWriter' = None) -> None: # noqa
super(RndRewardModel, self).__init__()
self.cfg = config
self.intrinsic_reward_rescale = self.cfg.intrinsic_reward_rescale
assert device == "cpu" or device.startswith("cuda")
self.device = device
if tb_logger is None: # TODO
@@ -132,7 +125,7 @@ def estimate(self, data: list) -> List[Dict]:
self._running_mean_std_rnd_reward.update(mse.cpu().numpy())

# Note: according to the min-max normalization, transform rnd reward to [0,1]
rnd_reward = (mse - mse.min()) / (mse.max() - mse.min() + 1e-11)
rnd_reward = (mse - mse.min()) / (mse.max() - mse.min() + 1e-8)

self.estimate_cnt_rnd += 1
self.tb_logger.add_scalar('rnd_reward/rnd_reward_max', rnd_reward.max(), self.estimate_cnt_rnd)
@@ -148,27 +141,17 @@
# rewards = torch.stack([data[i]['reward'] for i in range(len(data))])
# rewards = (rewards - torch.min(rewards)) / (torch.max(rewards) - torch.min(rewards))

# TODO(pu): how to set intrinsic_reward_rescale automatically?
if self.cfg.intrinsic_reward_weight is None:
"""
NOTE: the following way of setting self.cfg.intrinsic_reward_weight is only suitable for the dense
reward env like lunarlander, not suitable for the sparse reward env.
In sparse reward envs, e.g. minigrid, if the agent reaches the goal, it obtains reward ~1, otherwise 0.
Thus, in sparse reward env, it's reasonable to set the intrinsic_reward_weight approximately equal to
the inverse of max_episode_steps.
"""
self.cfg.intrinsic_reward_weight = self.intrinsic_reward_rescale * max(
1,
abs(
max([train_data_augmented[i]['reward'] for i in range(len(train_data_augmented))]) -
min([train_data_augmented[i]['reward'] for i in range(len(train_data_augmented))])
)
)
for item, rnd_rew in zip(train_data_augmented, rnd_reward):
if self.intrinsic_reward_type == 'add':
item['reward'] = item['reward'] + rnd_rew * self.cfg.intrinsic_reward_weight
if self.cfg.extrinsic_reward_norm:
item['reward'] = item[
'reward'] / self.cfg.extrinsic_reward_norm_max + rnd_rew * self.cfg.intrinsic_reward_weight
else:
item['reward'] = item['reward'] + rnd_rew * self.cfg.intrinsic_reward_weight
elif self.intrinsic_reward_type == 'new':
item['intrinsic_reward'] = rnd_rew
if self.cfg.extrinsic_reward_norm:
item['reward'] = item['reward'] / self.cfg.extrinsic_reward_norm_max
elif self.intrinsic_reward_type == 'assign':
item['reward'] = rnd_rew

2 changes: 1 addition & 1 deletion dizoo/minigrid/__init__.py
@@ -1,4 +1,4 @@
from gym.envs.registration import register
from gymnasium.envs.registration import register

register(id='MiniGrid-AKTDT-7x7-1-v0', entry_point='dizoo.minigrid.envs:AppleKeyToDoorTreasure_7x7_1')
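
The import swap above moves the env registration from `gym` onto `gymnasium`. A minimal sketch of the same pattern (the id and entry point are the ones from this file; creating the env assumes DI-engine's `dizoo` package and its MiniGrid dependency are installed):

```python
import gymnasium as gym
from gymnasium.envs.registration import register

# Mirrors the registration above; run in a fresh process so the id is not
# already registered by importing dizoo.minigrid.
register(
    id='MiniGrid-AKTDT-7x7-1-v0',
    entry_point='dizoo.minigrid.envs:AppleKeyToDoorTreasure_7x7_1',
)

env = gym.make('MiniGrid-AKTDT-7x7-1-v0')
obs, info = env.reset(seed=0)  # gymnasium's reset returns (obs, info)
```
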

59 changes: 39 additions & 20 deletions dizoo/minigrid/config/minigrid_icm_offppo_config.py
@@ -1,64 +1,83 @@
from easydict import EasyDict

minigrid_ppo_icm_config = dict(
exp_name='minigrid_icm_offppo_seed0',
minigrid_icm_offppo_config = dict(
exp_name='minigrid_fourroom_icm_offppo_seed0',
env=dict(
collector_env_num=8,
evaluator_env_num=5,
n_evaluator_episode=5,
# typical MiniGrid env id:
# {'MiniGrid-Empty-8x8-v0', 'MiniGrid-FourRooms-v0', 'MiniGrid-DoorKey-8x8-v0','MiniGrid-DoorKey-16x16-v0'},
# please refer to https://github.com/Farama-Foundation/MiniGrid for details.
env_id='MiniGrid-DoorKey-8x8-v0',
max_step=300,
# minigrid env id: 'MiniGrid-Empty-8x8-v0', 'MiniGrid-FourRooms-v0','MiniGrid-DoorKey-16x16-v0','MiniGrid-AKTDT-7x7-1-v0'
env_id='MiniGrid-AKTDT-7x7-1-v0',
max_step=100,
stop_value=0.96,
# stop_value=12, # run fixed env_steps for MiniGrid-AKTDT-7x7-1-v0
),
reward_model=dict(
intrinsic_reward_type='add',
learning_rate=0.001,
obs_shape=2739,
# intrinsic_reward_weight means the relative weight of the ICM intrinsic_reward.
# Specifically for the sparse reward env MiniGrid: if the agent reaches the goal,
# it gets reward ~1, otherwise 0.
# We could set the intrinsic_reward_weight approximately equal to the inverse of max_episode_steps.
# Please refer to rnd_reward_model for details.
intrinsic_reward_weight=0.001,
learning_rate=3e-4,
obs_shape=2835,
batch_size=320,
update_per_collect=10,
update_per_collect=50,
clear_buffer_per_iters=int(1e3),
obs_norm=True,
obs_norm_clamp_max=5,
obs_norm_clamp_min=-5,
extrinsic_reward_norm=True,
extrinsic_reward_norm_max=1,
),
policy=dict(
cuda=True,
recompute_adv=True,
action_space='discrete',
model=dict(
obs_shape=2835,
action_shape=7,
action_space='discrete',
encoder_hidden_size_list=[256, 128, 64, 64],
critic_head_hidden_size=64,
actor_head_hidden_size=64,
),
learn=dict(
update_per_collect=10,
epoch_per_collect=10,
update_per_collect=1,
batch_size=320,
learning_rate=0.0003,
learning_rate=3e-4,
value_weight=0.5,
entropy_weight=0.001,
clip_ratio=0.2,
adv_norm=False,
adv_norm=True,
value_norm=True,
Review comment (Member): offppo doesn't have value norm

),
collect=dict(
n_sample=3200,
unroll_len=1,
discount_factor=0.99,
gae_lambda=0.95,
),
eval=dict(evaluator=dict(eval_freq=200, )),
),
)
minigrid_ppo_icm_config = EasyDict(minigrid_ppo_icm_config)
main_config = minigrid_ppo_icm_config
minigrid_ppo_icm_create_config = dict(
minigrid_icm_offppo_config = EasyDict(minigrid_icm_offppo_config)
main_config = minigrid_icm_offppo_config
minigrid_icm_offppo_create_config = dict(
env=dict(
type='minigrid',
import_names=['dizoo.minigrid.envs.minigrid_env'],
),
env_manager=dict(type='subprocess'),
env_manager=dict(type='base'),
policy=dict(type='ppo_offpolicy'),
reward_model=dict(type='icm'),
)
minigrid_ppo_icm_create_config = EasyDict(minigrid_ppo_icm_create_config)
create_config = minigrid_ppo_icm_create_config
minigrid_icm_offppo_create_config = EasyDict(minigrid_icm_offppo_create_config)
create_config = minigrid_icm_offppo_create_config

if __name__ == "__main__":
# or you can enter `ding -m serial -c minigrid_icm_offppo_config.py -s 0`
from ding.entry import serial_pipeline_reward_model_offpolicy
serial_pipeline_reward_model_offpolicy([main_config, create_config], seed=0)
serial_pipeline_reward_model_offpolicy([main_config, create_config], seed=0, max_env_step=int(10e6))
85 changes: 85 additions & 0 deletions dizoo/minigrid/config/minigrid_icm_onppo_config.py
@@ -0,0 +1,85 @@
from easydict import EasyDict

collector_env_num = 8
evaluator_env_num = 5
minigrid_icm_onppo_config = dict(
exp_name='minigrid_fourroom_icm_onppo_seed0',
env=dict(
collector_env_num=collector_env_num,
evaluator_env_num=evaluator_env_num,
n_evaluator_episode=evaluator_env_num,
# minigrid env id: 'MiniGrid-Empty-8x8-v0', 'MiniGrid-FourRooms-v0','MiniGrid-DoorKey-16x16-v0'
env_id='MiniGrid-FourRooms-v0',
max_step=300,
stop_value=2, # run fixed env_steps
# stop_value=0.96,
),
reward_model=dict(
intrinsic_reward_type='add',
# intrinsic_reward_weight means the relative weight of ICM intrinsic_reward.
# Specifically for the sparse reward env MiniGrid: if the agent reaches the goal,
# it gets reward ~1, otherwise 0.
# We could set the intrinsic_reward_weight approximately equal to the inverse of max_episode_steps.
# Please refer to rnd_reward_model for details.
intrinsic_reward_weight=0.003, # 1/300
learning_rate=3e-4,
obs_shape=2835,
batch_size=320,
update_per_collect=50,
clear_buffer_per_iters=int(1e3),
obs_norm=True,
obs_norm_clamp_max=5,
obs_norm_clamp_min=-5,
extrinsic_reward_norm=True,
extrinsic_reward_norm_max=1,
),
policy=dict(
cuda=True,
recompute_adv=True,
action_space='discrete',
model=dict(
obs_shape=2835,
action_shape=7,
action_space='discrete',
encoder_hidden_size_list=[256, 128, 64, 64],
critic_head_hidden_size=64,
actor_head_hidden_size=64,
),
learn=dict(
epoch_per_collect=10,
update_per_collect=1,
batch_size=320,
learning_rate=3e-4,
value_weight=0.5,
entropy_weight=0.001,
clip_ratio=0.2,
adv_norm=True,
value_norm=True,
),
collect=dict(
n_sample=3200,
unroll_len=1,
discount_factor=0.99,
gae_lambda=0.95,
),
eval=dict(evaluator=dict(eval_freq=1000, )),
),
)
minigrid_icm_onppo_config = EasyDict(minigrid_icm_onppo_config)
main_config = minigrid_icm_onppo_config
minigrid_icm_onppo_create_config = dict(
env=dict(
type='minigrid',
import_names=['dizoo.minigrid.envs.minigrid_env'],
),
env_manager=dict(type='subprocess'),
policy=dict(type='ppo'),
reward_model=dict(type='icm'),
)
minigrid_icm_onppo_create_config = EasyDict(minigrid_icm_onppo_create_config)
create_config = minigrid_icm_onppo_create_config

if __name__ == "__main__":
# or you can enter `ding -m serial -c minigrid_icm_onppo_config.py -s 0`
from ding.entry import serial_pipeline_reward_model_onpolicy
serial_pipeline_reward_model_onpolicy([main_config, create_config], seed=0, max_env_step=int(10e6))
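
Since the config comments list several alternative MiniGrid ids, one way to reuse this file for another task is to import and override it before launching. A sketch under the assumption that the module paths from this PR are importable as shown; the overridden values are illustrative, not tuned:

```python
from copy import deepcopy

from ding.entry import serial_pipeline_reward_model_onpolicy
from dizoo.minigrid.config.minigrid_icm_onppo_config import create_config, main_config

cfg, create_cfg = deepcopy(main_config), deepcopy(create_config)
cfg.exp_name = 'minigrid_doorkey16x16_icm_onppo_seed0'
cfg.env.env_id = 'MiniGrid-DoorKey-16x16-v0'
cfg.env.max_step = 300
# Heuristic from the comments above: weight close to 1 / max_episode_steps.
cfg.reward_model.intrinsic_reward_weight = 1 / cfg.env.max_step

if __name__ == "__main__":
    serial_pipeline_reward_model_onpolicy([cfg, create_cfg], seed=0, max_env_step=int(10e6))
```
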
1 change: 0 additions & 1 deletion dizoo/minigrid/config/minigrid_onppo_config.py
@@ -53,7 +53,6 @@
),
env_manager=dict(type='base'),
# env_manager=dict(type='subprocess'),

policy=dict(type='ppo'),
)
minigrid_ppo_create_config = EasyDict(minigrid_ppo_create_config)