# RL Framework for HPO

In [171]:
from typing import Self, Optional, Iterable
from abc import ABC, abstractclassmethod
from collections import deque, namedtuple
from dataclasses import field
from dataclasses import dataclass, make_dataclass
import logging
import numpy as np
import pandas as pd

from sklearn.utils import Bunch
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

import torch
from torch import Tensor
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import functional as F
from torch.distributions import Distribution, Normal
from torch.optim import Optimizer, Adam

In [42]:
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)

## Dataset

In [43]:
iris = load_iris(as_frame=True)
iris_df = iris.frame
X = iris.data.to_numpy()
y = iris.target.to_numpy()

iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [44]:
def train_valid_test_split(
        X: np.ndarray,
        y: np.ndarray,
        valid_size: float = 0.2,
        test_size: float = 0.2,
        random_state: Optional[int] = None,
        shuffle: bool = True
    ) -> dict:
    """Split a dataset into training, validation and test subsets.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix.
    y : np.ndarray
        Targets aligned with ``X``.
    valid_size : float, optional
        Fraction of the data used for validation, by default 0.2
    test_size : float, optional
        Fraction of the data used for testing, by default 0.2
    random_state : Optional[int], optional
        Seed forwarded to every split, by default None
    shuffle : bool, optional
        Whether to shuffle before splitting, by default True

    Returns
    -------
    dict
        ``{"train": (X, y), "valid": (X, y), "test": (X, y)}``.
        A subset whose requested size is 0 is returned as ``(None, None)``.

    Raises
    ------
    AssertionError
        If any of the sizes is out of range.
    """

    # Validate every fraction before doing any work
    assert valid_size >= 0 and valid_size <= 1,\
        "size of validation dataset must be >=0 and <=1"
    assert test_size >= 0 and test_size <= 1,\
        "size of test dataset must be >=0 and <=1"

    # Calculate size of training dataset
    valid_test_size = valid_size + test_size
    train_size = 1 - valid_test_size
    assert train_size > 0 and train_size <= 1,\
        "size of training dataset must be >0 and <=1"

    # Subsets that are not requested stay None
    X_valid = y_valid = None
    X_test = y_test = None

    if valid_test_size == 0:
        # There is only the training dataset: nothing to split off, but
        # the training data is still shuffled when requested so that the
        # result is consistent with the other branches.
        if shuffle:
            from sklearn.utils import shuffle as shuffle_arrays

            X_train, y_train = shuffle_arrays(
                X, y,
                random_state=random_state
            )
        else:
            X_train, y_train = X, y

    else:
        # Split the training dataset and the remaining dataset
        X_train, X_valid_test, y_train, y_valid_test = train_test_split(
            X, y,
            train_size=train_size,
            random_state=random_state,
            shuffle=shuffle
        )

        if valid_size > 0 and test_size > 0:
            # Split the validation dataset and test dataset
            X_valid, X_test, y_valid, y_test = train_test_split(
                X_valid_test, y_valid_test,
                train_size=valid_size / valid_test_size,
                random_state=random_state,
                shuffle=shuffle
            )

        elif valid_size > 0:
            # There is no test dataset
            X_valid, y_valid = X_valid_test, y_valid_test

        else:
            # There is no validation dataset
            X_test, y_test = X_valid_test, y_valid_test

    return {
        "train": (X_train, y_train),
        "valid": (X_valid, y_valid),
        "test": (X_test, y_test)
    }


In [45]:
@dataclass
class Bounds:
    """Pair of lower and upper limits for a numeric value."""
    # Lower limit.
    min: float | int
    # Upper limit.
    max: float | int
    
bounds = Bounds(min=0.1, max=1)
# The stored value keeps the type it was passed with: the literal 1 stays an
# int even though the annotation also allows float.
type(bounds.max)

int

In [125]:
@dataclass
class HpConfig(ABC):

    @classmethod
    def param_names(cls) -> tuple[str]:
        """Hyperparameter names.
        """
        
        return tuple(cls.__dataclass_fields__.keys())
    
    @classmethod
    def param_type(cls, name: str) -> type:
        """Data type of the hyperparameter.
        """
        
        return cls.__dataclass_fields__.get(name).type
    
    @classmethod
    def dim(cls) -> int:
        """Dimension of the hyperparameter space, i.e.,
        the number of hyperparameters.
        """
        
        return len(cls.param_names())
    
    @classmethod
    def from_action(
            cls, 
            action: Iterable[float],
            bounds: dict[str, tuple]
        ) -> Self:
        
        hps = {}
        for i, hp in enumerate(action):
            hp_name = cls.param_names()[i]
            hp_type = cls.param_type(hp_name)
            hp_min, hp_max = bounds[hp_name]         
            hp = hp_type(hp * (hp_max - hp_min) + hp_min)
            hps[hp_name] = hp
            
        return cls(**hps)        
    
    def to_dict(self) -> dict:
        
        return self.__dict__


In [192]:
def hp(cls):
    """Class decorator turning an annotated class into an ``HpConfig``.

    A new dataclass deriving from ``HpConfig`` is built from the
    annotations of ``cls``. Field defaults, non-dunder attributes
    (methods, properties, ...), the docstring, the module and the
    qualified name of ``cls`` are carried over to the new class.
    Dunder attributes defined on ``cls`` are not carried over.

    Parameters
    ----------
    cls : type
        Class whose annotations describe the hyperparameters.

    Returns
    -------
    type
        Dataclass named like ``cls`` and deriving from ``HpConfig``.
    """

    from dataclasses import Field

    annotations = cls.__annotations__
    attributes = vars(cls)

    # Build the field specs, keeping class-level defaults
    fields = []
    for name, type_ in annotations.items():
        if name in attributes:
            default = attributes[name]
            spec = default if isinstance(default, Field) else field(default=default)
            fields.append((name, type_, spec))
        else:
            fields.append((name, type_))

    # Everything else that the user defined on the class body
    namespace = {
        key: value
        for key, value in attributes.items()
        if key not in annotations
        and not (key.startswith("__") and key.endswith("__"))
    }

    new_cls = make_dataclass(
        cls.__name__,
        fields,
        bases=(HpConfig,),
        namespace=namespace
    )

    # Make the generated class indistinguishable from the decorated one
    new_cls.__module__ = cls.__module__
    new_cls.__qualname__ = cls.__qualname__
    if cls.__doc__:
        new_cls.__doc__ = cls.__doc__

    return new_cls


In [196]:

# SVCConfig is declared through the @hp decorator, which builds a dataclass
# deriving from HpConfig out of the annotations below.
@hp
class SVCConfig:
    """Hyperparameters searched for the SVC model.

    The field names are the keyword arguments that are passed on to
    ``SVC`` via ``SVC(**hp_config.to_dict())``.
    """
    
    C: float
    gamma: float
    tol: float

# Sanity check: the configuration exposes three hyperparameters.
SVCConfig(C=0.1, gamma=0.1, tol=0.01).dim()

3

In [191]:
# Scratch experiment: build an HpConfig subclass directly with make_dataclass,
# mixing a plain (name, type) spec with a (name, type, field()) spec.
# NOTE(review): `A` is not referenced again in the visible notebook — confirm
# whether this cell can be removed.
A = make_dataclass(
    "A", 
    fields=(
        ("C", float),
        ("tol", float, field()),
    ),
    bases=(HpConfig, object)
)



In [47]:
class Env:
    """Environment for tuning the hyperparameters of an ``SVC``.

    A state is the window of the last ``state_dim`` actions, an action is
    a vector with one entry per ``SVCConfig`` hyperparameter, and the
    reward is the accuracy on the validation dataset of an ``SVC``
    trained with the hyperparameters derived from the action.
    """

    # Search range (min, max) of every hyperparameter, used unless the
    # caller provides its own bounds.
    DEFAULT_BOUNDS = {
        "C": (0.1, 1.0),
        "gamma": (0.001, 0.1),
        "tol": (0.001, 0.1)
    }

    def __init__(
            self, 
            dataset: Bunch,
            *,
            valid_size: float = 0.2,
            test_size: float = 0.2,
            state_dim: int = 10,
            random_state: Optional[int] = None,
            bounds: Optional[dict[str, tuple]] = None,
        ) -> None:
        """Create the environment.

        Parameters
        ----------
        dataset : Bunch
            Dataset exposing ``data`` and ``target``.
        valid_size : float, optional
            Fraction of the data used for validation, by default 0.2
        test_size : float, optional
            Fraction of the data used for testing, by default 0.2
        state_dim : int, optional
            Number of past actions that make up a state, by default 10
        random_state : Optional[int], optional
            Seed for the data split and the initial state, by default None
        bounds : Optional[dict[str, tuple]], optional
            ``(min, max)`` per hyperparameter name; ``DEFAULT_BOUNDS``
            is used when omitted, by default None
        """

        # Data
        X = dataset.data
        y = dataset.target

        # Random state
        self._random_state = random_state

        # Split into training, validation and test datasets
        split = train_valid_test_split(
            X, y,
            valid_size=valid_size,
            test_size=test_size,
            random_state=random_state
        )
        self._X_train, self._y_train = split["train"]
        self._X_valid, self._y_valid = split["valid"]
        self._X_test, self._y_test = split["test"]

        # Hyperparameter bounds (copied so later changes made by the
        # caller do not leak into the environment)
        self._bounds = dict(self.DEFAULT_BOUNDS if bounds is None else bounds)

        # State dimension
        self._state_dim = state_dim

        # A buffer of actions taken
        self._actions_taken = deque(maxlen=state_dim)

        # Reset env
        self._init_state = self.reset()

    @property
    def state_dim(self) -> int:
        """State dimension.
        """

        return self._state_dim

    @property
    def init_state(self) -> np.ndarray:
        """Initial state.
        """

        return self._init_state

    def reset(self) -> np.ndarray:
        """Refill the action buffer with random actions.

        The generator is re-created from ``random_state`` on every call,
        so with a fixed seed every reset yields the same initial state.

        Returns
        -------
        np.ndarray
            Initial state, one row per buffered action.
        """

        # Sigmoid function, squashes the random values into (0, 1)
        sigmoid = lambda x: 1 / (1 + np.exp(-x))

        # NumPy's random generator
        rng = np.random.RandomState(seed=self._random_state)

        # Generate random actions
        random_actions = [
            sigmoid(rng.randn(SVCConfig.dim())) 
            for _ in range(self._state_dim)
        ]

        # Reset actions taken
        self._actions_taken.clear()
        self._actions_taken.extend(random_actions)

        # Create an initial state
        init_state = np.array(self._actions_taken)
        self._init_state = init_state

        return init_state

    def step(self, action: Iterable[float]) -> tuple[np.ndarray, float]:
        """Apply an action and evaluate the resulting hyperparameters.

        Parameters
        ----------
        action : Iterable[float]
            One value per hyperparameter of ``SVCConfig``.

        Returns
        -------
        tuple[np.ndarray, float]
            Next state and the accuracy on the validation dataset.

        Raises
        ------
        ValueError
            If the environment has no validation dataset.
        """

        # Fail before touching the buffer or training a model
        if self._X_valid is None:
            raise ValueError(
                "a validation dataset is required to compute the reward; "
                "create the environment with valid_size > 0"
            )

        # Generate the next state
        self._actions_taken.append(action)
        state = np.array(self._actions_taken)

        # Create the HP configuration from the action taken
        hp_config = SVCConfig.from_action(
            action,
            bounds=self._bounds
        )

        # Create the model with selected HP config
        svc = SVC(**hp_config.to_dict())

        # Train the model
        svc.fit(self._X_train, self._y_train)

        # Compute the accuracy on validation dataset
        y_pred = svc.predict(self._X_valid)
        reward = accuracy_score(self._y_valid, y_pred)

        return state, reward


In [79]:
hp_config = SVCConfig(
    C=0.1,
    gamma=0.1,
    tol=0.01
)

hp_config

make_hp_config.<locals>._cls(C=0.1, gamma=0.1, tol=0.01)

In [None]:
SVCConfig

In [None]:
def make_hp_config(cls):
    """Class decorator adding ``HpConfig`` to the bases of ``cls``.

    ``cls`` is turned into a dataclass first when it is not one already
    (this modifies ``cls`` in place); otherwise its annotated fields
    would not be picked up by the generated class. The generated class
    takes over the name, qualified name, module and docstring of ``cls``
    instead of showing up as ``make_hp_config.<locals>._cls``.

    Parameters
    ----------
    cls : type
        Class whose annotations describe the hyperparameters.

    Returns
    -------
    type
        Dataclass deriving from both ``cls`` and ``HpConfig``.
    """

    # Fields declared on a non-dataclass base are ignored by dataclass()
    if "__dataclass_fields__" not in vars(cls):
        cls = dataclass(cls)

    class _HpConfigSubclass(cls, HpConfig):
        pass

    # Present the generated class under the identity of the decorated one
    _HpConfigSubclass.__name__ = cls.__name__
    _HpConfigSubclass.__qualname__ = cls.__qualname__
    _HpConfigSubclass.__module__ = cls.__module__
    if cls.__doc__:
        _HpConfigSubclass.__doc__ = cls.__doc__

    return dataclass(_HpConfigSubclass)



In [59]:
isinstance(hp_config, HpConfig)

True

In [55]:
env = Env(
    iris, 
    test_size=0.3,
    random_state=42
)

In [9]:
class Agent(nn.Module):
    """Recurrent policy producing a Normal distribution over actions.

    The state is embedded by a linear layer and fed through an LSTM; the
    output at the last position is mapped by two sigmoid heads to the
    mean and the standard deviation of the distribution. The most recent
    distribution and action are cached on the instance.
    """

    # Lower limit of the standard deviation. The sigmoid head can
    # underflow to exactly 0, which Normal does not accept as a scale.
    _MIN_STD = 1e-6

    def __init__(
            self,
            action_dim: int
        ) -> None:
        """Create the policy network.

        Parameters
        ----------
        action_dim : int
            Number of entries of an action.
        """

        super().__init__()

        self._action_dim = action_dim
        self._action = None
        self._distribution = None

        # NOTE(review): not read anywhere in this class.
        self._n_dist_params = 2

        self.fc = nn.Linear(action_dim, 64)

        # NOTE(review): forward() only uses self.lstm; this cell is kept
        # so that the registered parameters stay unchanged.
        self.lstm_cell = nn.LSTMCell(
            input_size=64,
            hidden_size=128
        )

        self.lstm = nn.LSTM(
            input_size=64,
            hidden_size=128,
            num_layers=1,
            batch_first=True
        )

        # Head for the mean of the distribution
        self.dist_param1 = nn.Sequential(
            nn.Linear(128, action_dim),
            nn.Sigmoid()
        )

        # Head for the standard deviation of the distribution
        self.dist_param2 = nn.Sequential(
            nn.Linear(128, action_dim),
            nn.Sigmoid()
        )

    @property
    def distribution(self) -> Distribution:
        """Distribution to generate the action.
        """

        return self._distribution

    @property
    def action(self) -> Tensor:
        """Action taken.
        """

        return self._action

    def forward(self, state: Tensor) -> Distribution:
        """Compute the action distribution for a state.

        Parameters
        ----------
        state : Tensor
            (state_dim, action_dim)
            (N, state_dim, action_dim)

        Returns
        -------
        Distribution
            Normal distribution over actions; it is also cached and
            available through ``distribution``.
        """

        x = self.fc(state)
        x, _ = self.lstm(x)

        # Keep the output at the last position of the sequence
        x = x[..., -1, :]

        # Extract parameters for the distribution
        mean = self.dist_param1(x)
        std = self.dist_param2(x).clamp_min(self._MIN_STD)

        # Generate the distribution
        distribution = Normal(mean, std)

        # Store the distribution
        self._distribution = distribution 

        return distribution

    def select_action(self, state: Optional[Tensor] = None) -> Tensor:
        """Sample an action and cache it.

        Parameters
        ----------
        state : Optional[Tensor], optional
            State to compute a new distribution from. When None, the
            cached distribution is reused, by default None

        Returns
        -------
        Tensor
            Sampled action with every entry clipped to [0, 1].

        Raises
        ------
        RuntimeError
            If no state is given and no distribution has been computed.
        """

        # Generate a new distribution
        # if the state is provided
        if state is not None:
            self.forward(state)

        if self._distribution is None:
            raise RuntimeError(
                "no distribution available; "
                "provide a state or call forward(state) first"
            )

        # Select an action randomly from the distribution
        action = self._distribution.sample()

        # Clip the action by upper and lower bounds
        # More specifically, each entry of the vector must be between 0 and 1
        # NOTE(review): the clipped action is what gets cached, so
        # log_prob() evaluates the density at the clipped value.
        action = action.clip(0, 1)

        # Store the selected action
        self._action = action

        return action

    def log_prob(
            self, 
            action: Optional[Tensor] = None, 
            state: Optional[Tensor] = None
        ) -> Tensor:
        """Compute the log-probability of the action taken.

        Parameters
        ----------
        action : Optional[Tensor], optional
            Action to evaluate. When None, the cached action is used and
            ``state`` must be None as well, by default None
        state : Optional[Tensor], optional
            State to compute a new distribution from. When None, the
            cached distribution is used, by default None

        Returns
        -------
        Tensor
            Log-probability summed over the last dimension.

        Raises
        ------
        RuntimeError
            If the cached action or distribution is needed but missing.
        """

        if action is None:
            assert state is None,\
                "state must be set None since action is None"
            action = self._action
            if action is None:
                raise RuntimeError(
                    "no action available; "
                    "provide an action or call select_action(state) first"
                )

        if state is not None:
            self.forward(state)

        if self._distribution is None:
            raise RuntimeError(
                "no distribution available; "
                "provide a state or call forward(state) first"
            )

        log_prob = self._distribution.log_prob(action).sum(dim=-1)

        return log_prob


In [10]:
agent = Agent(
    action_dim=SVCConfig.dim()
)

state = torch.rand(10, 3)
action = agent.select_action(state)
log_prob = agent.log_prob()

print(f"state shape: {state.shape}")
print(f"action shape: {action.shape}")
print(f"log-probability shape: {log_prob.shape}")

state shape: torch.Size([10, 3])
action shape: torch.Size([3])
log-probability shape: torch.Size([])


In [11]:
def sma(x: np.ndarray, period: int = 5) -> np.ndarray:
    """Simple moving average along the first axis.

    Parameters
    ----------
    x : np.ndarray
        Input samples; the first axis is the time axis.
    period : int, optional
        Number of samples in every window, by default 5

    Returns
    -------
    np.ndarray
        ``len(x) - period + 1`` averages; entry ``i`` is the mean of
        ``x[i:i + period]``.

    Raises
    ------
    AssertionError
        If ``period`` is not positive or exceeds ``len(x)``.
    """

    assert period >= 1,\
        "the period must be at least 1"
    assert len(x) >= period,\
        "the length of the array must be at least the length of the period"

    # Every window holds exactly `period` samples, including its last one
    n_windows = len(x) - period + 1
    averages = [
        x[start:start + period].mean(axis=0)
        for start in range(n_windows)
    ]

    return np.array(averages)

def ema(
        x: np.ndarray, 
        period: int = 5,
        alpha: float = 0.2
    ) -> np.ndarray:
    """Exponential moving average along the first axis.

    The series starts with the mean of the first ``period`` samples;
    every later sample is blended in with weight ``alpha``.

    Parameters
    ----------
    x : np.ndarray
        Input samples; the first axis is the time axis.
    period : int, optional
        Number of samples averaged for the first value, by default 5
    alpha : float, optional
        Weight of the newest sample, by default 0.2

    Returns
    -------
    np.ndarray
        ``len(x) - period + 1`` averages.
    """

    assert len(x) >= period,\
        "the length of the array must be at least the length of the period"

    # Seed with the plain average of the first window
    smoothed = [x[:period].mean(axis=0)]

    # Blend each remaining sample into the latest average
    for sample in x[period:]:
        smoothed.append(
            sample * alpha + smoothed[-1] * (1 - alpha)
        )

    return np.array(smoothed)


In [12]:
ema(np.arange(10), period=2).shape

(9,)

In [13]:
# One step of interaction with the environment
Transition = namedtuple(
    "Transition",
    (
        "state",
        "action",
        "reward",
        "advantage",
        "log_prob"
    )
)

def convert_to_transition_with_fields_as_lists(transitions: list[Transition]) -> Transition:
    """Pack transitions into one ``Transition`` whose fields are lists.

    Parameters
    ----------
    transitions : list[Transition]
        Transitions to pack; may be empty.

    Returns
    -------
    Transition
        Field ``f`` holds the list of ``f`` values of all transitions,
        in input order. Every field is an empty list for empty input.
    """

    transitions = list(transitions)

    # zip(*[]) yields nothing, which would leave every field missing
    if not transitions:
        return Transition(*([] for _ in Transition._fields))

    return Transition(*map(list, zip(*transitions)))

def convert_to_transitions(transition_with_fields_as_list: Transition) -> list[Transition]:
    """Unpack a ``Transition`` of per-field sequences into transitions.

    Parameters
    ----------
    transition_with_fields_as_list : Transition
        ``Transition`` whose fields are equally long sequences.

    Returns
    -------
    list[Transition]
        One ``Transition`` per position of the field sequences.
    """

    return [
        Transition(*fields)
        for fields in zip(*transition_with_fields_as_list)
    ]


In [14]:
def play_one_episode(
        env: Env,
        agent: Agent,
        max_n_timesteps_per_episode: int,
        warm_start_duration: int
    ) -> list[Transition]:
    """Roll out one episode and return its transitions.

    The agent acts for ``max_n_timesteps_per_episode`` steps. The first
    ``warm_start_duration`` steps only serve to initialise the reward
    baseline (an exponential moving average of the rewards) and are
    dropped from the result; the advantage of every remaining step is
    its reward minus the baseline.

    Parameters
    ----------
    env : Env
        Environment to interact with; it is reset first.
    agent : Agent
        Policy selecting the actions.
    max_n_timesteps_per_episode : int
        Number of steps to play.
    warm_start_duration : int
        Number of leading steps excluded from the result.

    Returns
    -------
    list[Transition]
        ``max_n_timesteps_per_episode - warm_start_duration``
        transitions.

    Raises
    ------
    ValueError
        If ``warm_start_duration`` is negative or not smaller than
        ``max_n_timesteps_per_episode``.
    """

    # Check up front instead of failing inside ema() after the rollout
    if warm_start_duration < 0:
        raise ValueError("warm_start_duration must be non-negative")
    if max_n_timesteps_per_episode <= warm_start_duration:
        raise ValueError(
            "max_n_timesteps_per_episode must be greater than "
            "warm_start_duration"
        )

    states = []
    actions = []
    rewards = []
    log_probs = []

    state = env.reset()

    for _ in range(max_n_timesteps_per_episode):

        # Select an action
        state = torch.tensor(state, dtype=torch.float)
        action = agent.select_action(state)
        action = action.detach().numpy()

        # Compute the log-probability of the action taken
        log_prob = agent.log_prob()
        log_prob = log_prob.detach().item()

        # Interact with the env
        next_state, reward = env.step(action)

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        log_probs.append(log_prob)

        # Step to the next state
        state = next_state

    # Compute baselines using moving average technique
    baselines = ema(np.array(rewards), period=warm_start_duration + 1)

    # Drop the data in the warm start
    states = states[warm_start_duration:]
    actions = actions[warm_start_duration:]
    rewards = rewards[warm_start_duration:]
    log_probs = log_probs[warm_start_duration:]

    # Compute advantages
    advantages = np.asarray(rewards) - baselines

    transition_with_fields_as_list = Transition(
        state=states,
        action=actions,
        reward=rewards,
        advantage=advantages,
        log_prob=log_probs
    )

    # Convert to list of transitions
    transitions = convert_to_transitions(transition_with_fields_as_list)

    return transitions


In [15]:
class ReplayBuffer(deque, Dataset):
    """Bounded store of transitions that can be fed to a ``DataLoader``.

    The buffer is a deque limited to ``capacity`` items, filled by
    playing episodes in the given environment.
    """

    def __init__(
            self,
            env: Env,
            capacity: int,
            max_n_timesteps_per_episode: int,
            warm_start_duration: int
        ) -> None:
        """Create an empty buffer.

        Parameters
        ----------
        env : Env
            Environment the episodes are played in.
        capacity : int
            Maximum number of transitions kept.
        max_n_timesteps_per_episode : int
            Number of steps played per episode.
        warm_start_duration : int
            Number of leading steps dropped from every episode.
        """

        super().__init__(maxlen=capacity)

        self._capacity = capacity
        self._env = env
        self._warm_start_duration = warm_start_duration
        self._max_n_timesteps_per_episode = max_n_timesteps_per_episode

    @property
    def capacity(self) -> int:
        """Maximum number of transitions kept."""
        return self._capacity

    @property
    def max_n_timesteps_per_episode(self) -> int:
        """Number of steps played per episode."""
        return self._max_n_timesteps_per_episode

    def collect(self, agent: Agent) -> None:
        """Play episodes with ``agent`` until the buffer is full.

        Parameters
        ----------
        agent : Agent
            Policy used to play the episodes.
        """

        # Keep playing whole episodes while there is room left
        while self._capacity > len(self):
            episode = play_one_episode(
                self._env,
                agent,
                self._max_n_timesteps_per_episode,
                self._warm_start_duration
            )
            self.extend(episode)


In [16]:
def ppo_loss(
        *,
        curr_log_prob: Tensor,
        old_log_prob: Tensor,
        advantage: Tensor,
        epsilon: float = 0.2
    ) -> Tensor:
    """Clipped surrogate loss of PPO.

    Parameters
    ----------
    curr_log_prob : Tensor
        Log-probabilities of the actions under the current policy.
    old_log_prob : Tensor
        Log-probabilities of the same actions under the policy that
        collected them.
    advantage : Tensor
        Advantages of the actions.
    epsilon : float, optional
        Half-width of the clipping interval around 1, by default 0.2

    Returns
    -------
    Tensor
        Negated mean of the element-wise minimum of the unclipped and
        the clipped surrogate objective.
    """

    # Probability ratio between the current and the old policy
    prob_ratio = (curr_log_prob - old_log_prob).exp()

    unclipped = prob_ratio * advantage
    clipped = prob_ratio.clamp(1 - epsilon, 1 + epsilon) * advantage

    return -torch.minimum(unclipped, clipped).mean()


In [17]:
def update_agent(
        agent: Agent,
        optimizer: Optimizer,
        replay_buffer_loader: DataLoader,
        n_epochs: int,
        epsilon: float = 0.2
    ):
    """Optimise the agent on the buffered transitions with the PPO loss.

    Parameters
    ----------
    agent : Agent
        Policy to update.
    optimizer : Optimizer
        Optimizer stepping the agent's parameters.
    replay_buffer_loader : DataLoader
        Loader yielding batches with the fields of ``Transition``.
    n_epochs : int
        Number of passes over the loader.
    epsilon : float, optional
        Clipping parameter of the PPO loss, by default 0.2
    """

    for _ in range(n_epochs):
        for batch in replay_buffer_loader:

            # Re-evaluate the stored actions under the current policy
            new_log_prob = agent.log_prob(batch.action, batch.state)

            objective = ppo_loss(
                curr_log_prob=new_log_prob,
                old_log_prob=batch.log_prob,
                advantage=batch.advantage,
                epsilon=epsilon
            )

            # Gradient step
            optimizer.zero_grad()
            objective.backward()
            optimizer.step()


In [18]:

def train(
        agent: Agent,
        optimizer: Optimizer,
        n_epochs: int,
        replay_buffer: ReplayBuffer,
        batch_size: int,
        n_epochs_for_updating_agent: int,
        epsilon: float
    ):
    """Train the agent with PPO.

    Every epoch fills the replay buffer with fresh transitions, logs the
    mean reward of the buffered transitions, updates the agent on them
    and clears the buffer.

    Parameters
    ----------
    agent : Agent
        Policy to train.
    optimizer : Optimizer
        Optimizer stepping the agent's parameters.
    n_epochs : int
        Number of collect/update rounds.
    replay_buffer : ReplayBuffer
        Buffer used to collect the transitions.
    batch_size : int
        Batch size of the updates.
    n_epochs_for_updating_agent : int
        Number of passes over the buffer per round.
    epsilon : float
        Clipping parameter of the PPO loss.

    Returns
    -------
    list
        Mean reward of the buffered transitions, one entry per epoch.
    """

    # History over all epochs (kept outside the loop so it accumulates)
    avg_episode_rewards = []

    for epoch in range(n_epochs):

        logging.info(f"PPO epoch: {epoch + 1}")

        # Collect transitions
        replay_buffer.collect(agent)

        avg_episode_reward = np.mean(
            [transition.reward for transition in replay_buffer]
        )
        avg_episode_rewards.append(avg_episode_reward)
        logging.info(f"average episode rewards: {avg_episode_reward}")

        # Create a data loader
        replay_buffer_loader = DataLoader(
            replay_buffer,
            batch_size=batch_size
        )

        # Update the agent on the collected transitions
        update_agent(
            agent=agent,
            optimizer=optimizer,
            replay_buffer_loader=replay_buffer_loader,
            n_epochs=n_epochs_for_updating_agent,
            epsilon=epsilon
        )

        # Clear replay buffer
        replay_buffer.clear()

    return avg_episode_rewards


In [19]:
agent = Agent(
    action_dim=SVCConfig.dim()
)

optimizer = Adam(
    agent.parameters(), 
    lr=0.001
)

replay_buffer = ReplayBuffer(
    env,
    capacity=500,
    max_n_timesteps_per_episode=20,
    warm_start_duration=5
)

In [20]:
train(
    agent=agent,
    optimizer=optimizer,
    n_epochs=10,
    replay_buffer=replay_buffer,
    batch_size=8,
    n_epochs_for_updating_agent=5,
    epsilon=0.2
)

2023-09-25 16:16:18,301 | INFO | PPO epoch: 1
2023-09-25 16:16:19,716 | INFO | average episode rewards: 0.8919333333333334
2023-09-25 16:16:20,691 | INFO | PPO epoch: 2
2023-09-25 16:16:21,951 | INFO | average episode rewards: 0.9696
2023-09-25 16:16:22,887 | INFO | PPO epoch: 3
2023-09-25 16:16:24,097 | INFO | average episode rewards: 0.9902000000000001
2023-09-25 16:16:25,043 | INFO | PPO epoch: 4
2023-09-25 16:16:26,252 | INFO | average episode rewards: 0.9947999999999999
2023-09-25 16:16:27,208 | INFO | PPO epoch: 5
2023-09-25 16:16:28,409 | INFO | average episode rewards: 0.9954666666666667
2023-09-25 16:16:29,346 | INFO | PPO epoch: 6
2023-09-25 16:16:30,544 | INFO | average episode rewards: 0.9990666666666668
2023-09-25 16:16:31,485 | INFO | PPO epoch: 7
2023-09-25 16:16:32,675 | INFO | average episode rewards: 0.9984666666666667
2023-09-25 16:16:33,620 | INFO | PPO epoch: 8
2023-09-25 16:16:34,807 | INFO | average episode rewards: 1.0
2023-09-25 16:16:35,746 | INFO | PPO epoch:

In [21]:
agent.distribution

Normal(loc: torch.Size([4, 3]), scale: torch.Size([4, 3]))

In [None]:
agent.distribution.sample()

In [22]:
agent.action

tensor([0.7295, 0.6731, 0.3120])

In [23]:
agent.distribution

Normal(loc: torch.Size([4, 3]), scale: torch.Size([4, 3]))

In [26]:
hp_config = SVCConfig.from_action(
    action=agent.action.detach().numpy(),
    bounds={
        "C": (0.1, 1.0),
        "gamma": (0.001, 0.1),
        "tol": (0.001, 0.1)
    }
)

hp_config

SVCConfig(C=0.7565273582935333, gamma=0.06763471615314484, tol=0.03189250004291534)

In [33]:
svc = SVC(**hp_config.to_dict())

svc.fit(
    env._X_train,
    env._y_train
)

accuracy_score(
    env._y_test,
    svc.predict(env._X_test)
)

1.0

In [56]:
svc = SVC()

svc.fit(
    env._X_train,
    env._y_train
)

accuracy_score(
    env._y_test,
    svc.predict(env._X_test)
)

1.0