feature(pu): add efficientzero tictactoe configs (#204)

opendilab · Apr 8, 2024 · 46afa69 · 46afa69
1 parent 52ba3f9
commit 46afa69
Show file tree

Hide file tree

Showing 2 changed files with 177 additions and 0 deletions.
diff --git a/zoo/board_games/tictactoe/config/tictactoe_efficientzero_bot_mode_config.py b/zoo/board_games/tictactoe/config/tictactoe_efficientzero_bot_mode_config.py
@@ -0,0 +1,89 @@
+from easydict import EasyDict
+
+# ==============================================================
+# begin of the most frequently changed config specified by the user
+# ==============================================================
+collector_env_num = 8
+n_episode = 8
+evaluator_env_num = 5
+num_simulations = 25
+update_per_collect = 50
+batch_size = 256
+max_env_step = int(2e5)
+reanalyze_ratio = 0.
+# ==============================================================
+# end of the most frequently changed config specified by the user
+# ==============================================================
+
+tictactoe_efficientzero_config = dict(
+    exp_name=f'data_ez_ctree/tictactoe_efficientzero_bot-mode_ns{num_simulations}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0',
+    env=dict(
+        battle_mode='play_with_bot_mode',
+        collector_env_num=collector_env_num,
+        evaluator_env_num=evaluator_env_num,
+        n_evaluator_episode=evaluator_env_num,
+        manager=dict(shared_memory=False, ),
+    ),
+    policy=dict(
+        model=dict(
+            observation_shape=(3, 3, 3),
+            action_space_size=9,
+            image_channel=3,
+            # We use the small size model for tictactoe.
+            num_res_blocks=1,
+            num_channels=16,
+            fc_reward_layers=[8],
+            fc_value_layers=[8],
+            fc_policy_layers=[8],
+            support_scale=10,
+            reward_support_size=21,
+            value_support_size=21,
+            norm_type='BN',
+            downsample=False,
+            discrete_action_encoding_type='one_hot',
+        ),
+        cuda=True,
+        env_type='board_games',
+        action_type='varied_action_space',
+        use_augmentation=False,
+        game_segment_length=5,
+        update_per_collect=update_per_collect,
+        batch_size=batch_size,
+        optim_type='Adam',
+        lr_piecewise_constant_decay=False,
+        learning_rate=0.003,
+        grad_clip_value=0.5,
+        num_simulations=num_simulations,
+        reanalyze_ratio=reanalyze_ratio,
+        # NOTE：In board_games, we set large td_steps to make sure the value target is the final outcome.
+        td_steps=9,
+        num_unroll_steps=3,
+        # NOTE：In board_games, we set discount_factor=1.
+        discount_factor=1,
+        n_episode=n_episode,
+        eval_freq=int(2e3),
+        replay_buffer_size=int(1e4),
+        collector_env_num=collector_env_num,
+        evaluator_env_num=evaluator_env_num,
+    ),
+)
+tictactoe_efficientzero_config = EasyDict(tictactoe_efficientzero_config)
+main_config = tictactoe_efficientzero_config
+
+tictactoe_efficientzero_create_config = dict(
+    env=dict(
+        type='tictactoe',
+        import_names=['zoo.board_games.tictactoe.envs.tictactoe_env'],
+    ),
+    env_manager=dict(type='subprocess'),
+    policy=dict(
+        type='efficientzero',
+        import_names=['lzero.policy.efficientzero'],
+    ),
+)
+tictactoe_efficientzero_create_config = EasyDict(tictactoe_efficientzero_create_config)
+create_config = tictactoe_efficientzero_create_config
+
+if __name__ == "__main__":
+    from lzero.entry import train_muzero
+    train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step)
diff --git a/zoo/board_games/tictactoe/config/tictactoe_efficientzero_sp_mode_config.py b/zoo/board_games/tictactoe/config/tictactoe_efficientzero_sp_mode_config.py
@@ -0,0 +1,88 @@
+from easydict import EasyDict
+
+# ==============================================================
+# begin of the most frequently changed config specified by the user
+# ==============================================================
+collector_env_num = 8
+n_episode = 8
+evaluator_env_num = 5
+num_simulations = 25
+update_per_collect = 50
+batch_size = 256
+max_env_step = int(2e5)
+reanalyze_ratio = 0.
+# ==============================================================
+# end of the most frequently changed config specified by the user
+# ==============================================================
+
+tictactoe_efficientzero_config = dict(
+    exp_name=f'data_ez_ctree/tictactoe_efficientzero_sp-mode_ns{num_simulations}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0',
+    env=dict(
+        battle_mode='self_play_mode',
+        collector_env_num=collector_env_num,
+        evaluator_env_num=evaluator_env_num,
+        n_evaluator_episode=evaluator_env_num,
+        manager=dict(shared_memory=False, ),
+    ),
+    policy=dict(
+        model=dict(
+            observation_shape=(3, 3, 3),
+            action_space_size=9,
+            image_channel=3,
+            # We use the small size model for tictactoe.
+            num_res_blocks=1,
+            num_channels=16,
+            fc_reward_layers=[8],
+            fc_value_layers=[8],
+            fc_policy_layers=[8],
+            support_scale=10,
+            reward_support_size=21,
+            value_support_size=21,
+            downsample=False,
+            discrete_action_encoding_type='one_hot',
+        ),
+        cuda=True,
+        env_type='board_games',
+        action_type='varied_action_space',
+        use_augmentation=False,
+        game_segment_length=9,
+        update_per_collect=update_per_collect,
+        batch_size=batch_size,
+        optim_type='Adam',
+        lr_piecewise_constant_decay=False,
+        learning_rate=0.003,
+        grad_clip_value=0.5,
+        num_simulations=num_simulations,
+        reanalyze_ratio=reanalyze_ratio,
+        # NOTE：In board_games, we set large td_steps to make sure the value target is the final outcome.
+        td_steps=9,
+        num_unroll_steps=3,
+        # NOTE：In board_games, we set discount_factor=1.
+        discount_factor=1,
+        n_episode=n_episode,
+        eval_freq=int(2e3),
+        replay_buffer_size=int(1e4),
+        collector_env_num=collector_env_num,
+        evaluator_env_num=evaluator_env_num,
+    ),
+)
+tictactoe_efficientzero_config = EasyDict(tictactoe_efficientzero_config)
+main_config = tictactoe_efficientzero_config
+
+tictactoe_efficientzero_create_config = dict(
+    env=dict(
+        type='tictactoe',
+        import_names=['zoo.board_games.tictactoe.envs.tictactoe_env'],
+    ),
+    env_manager=dict(type='subprocess'),
+    policy=dict(
+        type='efficientzero',
+        import_names=['lzero.policy.efficientzero'],
+    ),
+)
+tictactoe_efficientzero_create_config = EasyDict(tictactoe_efficientzero_create_config)
+create_config = tictactoe_efficientzero_create_config
+
+if __name__ == "__main__":
+    from lzero.entry import train_muzero
+    train_muzero([main_config, create_config], seed=0, max_env_step=max_env_step)