diff --git a/pydeeprecsys/rl/agents/actor_critic.py b/pydeeprecsys/rl/agents/actor_critic.py
index 22e0424..68e0c73 100644
--- a/pydeeprecsys/rl/agents/actor_critic.py
+++ b/pydeeprecsys/rl/agents/actor_critic.py
@@ -1,5 +1,5 @@
 from pydeeprecsys.rl.agents.agent import ReinforcementLearning
-from typing import Any, List
+from typing import Any, List, Optional
 from pydeeprecsys.rl.experience_replay.experience_buffer import ExperienceReplayBuffer
 from pydeeprecsys.rl.experience_replay.buffer_parameters import (
     ExperienceReplayBufferParameters,
@@ -12,27 +12,34 @@ class ActorCriticAgent(ReinforcementLearning):
     """Policy estimator using a value estimator as a baseline.
     It's on-policy, for discrete action spaces, and episodic environments.
     This implementation uses stochastic policies.
-    TODO: could be a sub class of reinforces"""
+    TODO: could be a sub class of reinforce"""
 
     def __init__(
         self,
         n_actions: int,
         state_size: int,
         discount_factor: int = 0.99,
-        learning_rate=1e-3,
+        actor_hidden_layers: Optional[List[int]] = None,
+        critic_hidden_layers: Optional[List[int]] = None,
+        actor_learning_rate=1e-3,
+        critic_learning_rate=1e-3,
     ):
+        if not actor_hidden_layers:
+            actor_hidden_layers = [state_size * 2, state_size * 2]
+        if not critic_hidden_layers:
+            critic_hidden_layers = [state_size * 2, int(state_size / 2)]
         self.episode_count = 0
         self.value_estimator = ValueEstimator(
             state_size,
-            [state_size * 2, int(state_size / 2)],
+            critic_hidden_layers,
             1,
-            learning_rate=learning_rate,
+            learning_rate=critic_learning_rate,
         )
         self.policy_estimator = PolicyEstimator(
             state_size,
-            [state_size * 2, state_size * 2],
+            actor_hidden_layers,
             n_actions,
-            learning_rate=learning_rate,
+            learning_rate=actor_learning_rate,
         )
         self.discount_factor = discount_factor
         # starts the buffer
diff --git a/pydeeprecsys/rl/agents/rainbow.py b/pydeeprecsys/rl/agents/rainbow.py
index 71e557c..f7062cf 100644
--- a/pydeeprecsys/rl/agents/rainbow.py
+++ b/pydeeprecsys/rl/agents/rainbow.py
@@ -1,5 +1,5 @@
 from numpy.random import RandomState
-from typing import Any, Optional
+from typing import Any, Optional, List
 from numpy import arange
 from copy import deepcopy
 from pydeeprecsys.rl.neural_networks.dueling import DuelingDDQN
@@ -34,18 +34,20 @@ def __init__(
         batch_size: int = 32,
         noise_sigma: float = 0.017,
         discount_factor: float = 0.99,
-        learning_rate: float = 0.99,
+        learning_rate: float = 0.0001,
+        hidden_layers: List[int] = None,
         random_state: RandomState = RandomState(),
         statistics: Optional[LearningStatistics] = None,
     ):
         self.network = DuelingDDQN(
-            input_size,
-            output_size,
-            learning_rate,
-            noise_sigma,
-            discount_factor,
+            n_input=input_size,
+            n_output=output_size,
+            learning_rate=learning_rate,
+            noise_sigma=noise_sigma,
+            discount_factor=discount_factor,
             statistics=statistics,
+            hidden_layers=hidden_layers,
         )
         self.target_network = deepcopy(self.network)
diff --git a/pydeeprecsys/rl/agents/reinforce.py b/pydeeprecsys/rl/agents/reinforce.py
index e5364f7..32e3f23 100644
--- a/pydeeprecsys/rl/agents/reinforce.py
+++ b/pydeeprecsys/rl/agents/reinforce.py
@@ -1,6 +1,6 @@
 import numpy as np
 from pydeeprecsys.rl.agents.agent import ReinforcementLearning
-from typing import Any, List
+from typing import Any, List, Optional
 from pydeeprecsys.rl.experience_replay.experience_buffer import ExperienceReplayBuffer
 from pydeeprecsys.rl.experience_replay.buffer_parameters import (
     ExperienceReplayBufferParameters,
@@ -17,14 +17,16 @@ def __init__(
         self,
         n_actions: int,
         state_size: int,
+        hidden_layers: Optional[List[int]] = None,
         discount_factor: int = 0.99,  # a.k.a gamma
         learning_rate=1e-3,
     ):
         self.episode_count = 0
-
+        if not hidden_layers:
+            hidden_layers = [state_size * 2, state_size * 2]
         self.policy_estimator = PolicyEstimator(
             state_size,
-            [state_size * 2, state_size * 2],
+            hidden_layers,
             n_actions,
             learning_rate=learning_rate,
         )
diff --git a/pydeeprecsys/rl/manager.py b/pydeeprecsys/rl/manager.py
index 66bd910..0c26585 100644
--- a/pydeeprecsys/rl/manager.py
+++ b/pydeeprecsys/rl/manager.py
@@ -142,9 +142,9 @@ def hyperparameter_search(
         agent: type,
         params: dict,
         default_params: dict,
-        learning_statistics: LearningStatistics,
         episodes: int = 100,
         runs_per_combination: int = 3,
+        verbose: bool = True,
     ) -> dict:
         """Given an agent class, and a dictionary of hyperparameter names and values,
         will try all combinations, and return the mean reward of each combinatio
@@ -154,15 +154,27 @@
             if len(p_value) < 2:
                 continue
             for value in p_value:
-                rl = agent(**{p_name: value, **default_params})
+                rl = agent(**{**default_params, p_name: value})
+                learning_statistics = LearningStatistics()
                 combination_key = f"{p_name}={value}"
                 for run in range(runs_per_combination):
-                    print(f"Testing combination {p_name}={value} round {run}")
-                    self.train(rl=rl, max_episodes=episodes, should_print=False)
+                    self.train(
+                        rl=rl,
+                        max_episodes=episodes,
+                        should_print=False,
+                        statistics=learning_statistics,
+                    )
                     combination_results[combination_key].append(
-                        learning_statistics.moving_rewards[-1]
+                        learning_statistics.moving_rewards.iloc[-1]
                     )
-                    print(f"result was {learning_statistics.moving_rewards[-1]}")
+                    if verbose:
+                        print(
+                            f"\rTested combination {p_name}={value} round {run} "
+                            f"result was {learning_statistics.moving_rewards.iloc[-1]}"
+                            "\t\t",
+                            end="",
+                        )
+
         return combination_results
diff --git a/pydeeprecsys/rl/neural_networks/dueling.py b/pydeeprecsys/rl/neural_networks/dueling.py
index a762b3b..83c913d 100644
--- a/pydeeprecsys/rl/neural_networks/dueling.py
+++ b/pydeeprecsys/rl/neural_networks/dueling.py
@@ -19,30 +19,37 @@ def __init__(
         n_input: int,
         n_output: int,
         learning_rate: float,
+        hidden_layers: List[int] = None,
         noise_sigma: float = 0.17,
         discount_factor: float = 0.99,
         statistics: Optional[LearningStatistics] = None,
     ):
         super().__init__()
+        if not hidden_layers:
+            hidden_layers = [256, 256, 64, 64]
         self.discount_factor = discount_factor
-        self._build_network(n_input, n_output, noise_sigma)
+        self._build_network(n_input, n_output, noise_sigma, hidden_layers=hidden_layers)
         self.optimizer = Adam(self.parameters(), lr=learning_rate)
         self.statistics = statistics
 
-    def _build_network(self, n_input: int, n_output: int, noise_sigma: float):
+    def _build_network(
+        self, n_input: int, n_output: int, noise_sigma: float, hidden_layers: List[int]
+    ):
         """Builds the dueling network with noisy layers,
         the value subnet and the advantage subnet.
         TODO: add `.to_device()` to Modules"""
-        self.fully_connected_1 = Linear(n_input, 256, bias=True)
-        self.fully_connected_2 = NoisyLayer(256, 256, bias=True, sigma=noise_sigma)
+        assert len(hidden_layers) == 4
+        fc_1, fc_2, value_size, advantage_size = hidden_layers
+        self.fully_connected_1 = Linear(n_input, fc_1, bias=True)
+        self.fully_connected_2 = NoisyLayer(fc_1, fc_2, bias=True, sigma=noise_sigma)
         self.value_subnet = Sequential(
-            NoisyLayer(256, 64, bias=True, sigma=noise_sigma),
+            NoisyLayer(fc_2, value_size, bias=True, sigma=noise_sigma),
             ReLU(),
-            Linear(64, 1, bias=True),
+            Linear(value_size, 1, bias=True),
         )
         self.advantage_subnet = Sequential(
-            NoisyLayer(256, 64, bias=True, sigma=noise_sigma),
+            NoisyLayer(fc_2, advantage_size, bias=True, sigma=noise_sigma),
             ReLU(),
-            Linear(64, n_output, bias=True),
+            Linear(advantage_size, n_output, bias=True),
         )
 
     def forward(self, state):
diff --git a/pydeeprecsys/tests/unit/test_actor_critic.py b/pydeeprecsys/tests/unit/test_actor_critic.py
index 941fea6..397267d 100644
--- a/pydeeprecsys/tests/unit/test_actor_critic.py
+++ b/pydeeprecsys/tests/unit/test_actor_critic.py
@@ -11,7 +11,11 @@ def test_reinforce_init():
 def test_reinforce_interaction():
     manager = CartpoleManager()
     agent = ActorCriticAgent(
-        n_actions=2, state_size=4, discount_factor=0.95, learning_rate=0.001
+        n_actions=2,
+        state_size=4,
+        discount_factor=0.95,
+        actor_learning_rate=0.001,
+        critic_learning_rate=0.001,
     )
     learning_statistics = LearningStatistics()
     manager.train(
diff --git a/pydeeprecsys/tests/unit/test_manager.py b/pydeeprecsys/tests/unit/test_manager.py
index c11a900..e402531 100644
--- a/pydeeprecsys/tests/unit/test_manager.py
+++ b/pydeeprecsys/tests/unit/test_manager.py
@@ -1,6 +1,38 @@
 from pydeeprecsys.rl.manager import MovieLensFairnessManager
+from pydeeprecsys.rl.agents.reinforce import ReinforceAgent
 
 
 def test_movie_lens_manager():
     manager = MovieLensFairnessManager()
     assert manager.env is not None
+
+
+def test_hyperparameter_search():
+    manager = MovieLensFairnessManager()
+    agent = ReinforceAgent
+
+    default_params = {
+        "n_actions": manager.env.action_space.n,
+        "state_size": manager.env.observation_space.shape[0],
+        "hidden_layers": [64, 64],
+        "discount_factor": 0.95,
+        "learning_rate": 0.0001,
+    }
+
+    optimize_params = {
+        "hidden_layers": [[64, 64], [128, 128], [256, 256]],
+        "discount_factor": [0.9, 0.95, 0.99],
+        "learning_rate": [0.00001, 0.0001, 0.001],
+    }
+
+    results = manager.hyperparameter_search(
+        agent=agent,
+        runs_per_combination=2,
+        episodes=10,
+        params=optimize_params,
+        default_params=default_params,
+    )
+
+    assert results is not None
+    assert len(results.items()) == 9
+    assert len(results["discount_factor=0.9"]) == 2
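
For illustration, a minimal sketch of how the constructor knobs introduced above could be used. The import paths follow the modules touched in this diff; the environment sizes (state_size=25, n_actions=10) and the layer/learning-rate values are placeholders, not values taken from this change.

from pydeeprecsys.rl.agents.reinforce import ReinforceAgent
from pydeeprecsys.rl.agents.actor_critic import ActorCriticAgent

# Placeholder sizes; substitute the real environment's dimensions.
state_size, n_actions = 25, 10

# REINFORCE: hidden_layers now overrides the [state_size * 2, state_size * 2] default.
reinforce = ReinforceAgent(
    n_actions=n_actions,
    state_size=state_size,
    hidden_layers=[128, 128],
    learning_rate=0.0001,
)

# Actor-critic: the actor and the critic now take separate hidden layers and learning rates.
actor_critic = ActorCriticAgent(
    n_actions=n_actions,
    state_size=state_size,
    actor_hidden_layers=[64, 64],
    critic_hidden_layers=[64, 32],
    actor_learning_rate=0.001,
    critic_learning_rate=0.001,
)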
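
Similarly, a sketch of the four-element hidden_layers convention that DuelingDDQN now expects, [fc_1, fc_2, value_size, advantage_size], as enforced by the assert in _build_network. Passing None keeps the previous hard-coded sizes; the numeric values below are again placeholders.

from pydeeprecsys.rl.neural_networks.dueling import DuelingDDQN

# hidden_layers maps to [fc_1, fc_2, value_size, advantage_size];
# hidden_layers=None falls back to [256, 256, 64, 64].
network = DuelingDDQN(
    n_input=25,  # placeholder state size
    n_output=10,  # placeholder action count
    learning_rate=0.0001,
    hidden_layers=[128, 128, 32, 32],
)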