In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from multi_env import make_reversi_vec_env, SelfPlayEnv
import torch as th
from players import RandomPlayer
from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy
import numpy as np

In [3]:
board_shape = 4
n_envs = 10
env = make_reversi_vec_env(
    SelfPlayEnv, n_envs=n_envs,
    env_kwargs={
        'board_shape': board_shape,
        'LocalPlayer': RandomPlayer
    }
)

# Modificación de librería para que haga argmax solo sobre las válidas

In [4]:
model = PPO(
    ActorCriticPolicy,
    env,
    verbose=0,
)

In [5]:
model.predict(env.reset())

(array([ 5, 15, 11, 12,  9, 15,  9, 11, 12,  8], dtype=int64), None)

# Custom ActorCriticPolicy 

https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/policies.py

In [6]:
from boardgame2 import ReversiEnv

In [7]:
env_not_vect = ReversiEnv(board_shape)

In [None]:
env_not_vect.get_valid((state, player))

In [10]:
def get_actions_mask(state):
    player = 1
    valid_actions = env_not_vect.get_valid((state, player))
    return valid_actions.reshape(-1)  


In [11]:
get_actions_mask(env.reset()[0][0])

array([0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0], dtype=int8)

In [12]:
class CustomActorCriticPolicy(ActorCriticPolicy):
    def __init__(
        self,
        *args, # Todos los argumentos posicionales de ActorCriticPolicy
        actions_mask_func=None, # El nuevo argumento
        **kwargs # Todos los argumentos opcionales de ActorCriticPolicy
    ):
        super(CustomActorCriticPolicy, self).__init__(
            *args,
            **kwargs
        )
        if actions_mask_func:
            self.get_actions_mask = actions_mask_func
    
    #Devuelve las mascara con acciones validas
    def get_mask(obs):
        masks = np.zeros((len(obs), obs.shape[-1] * obs.shape[-2]))
        for i, board in enumerate(obs):
            board = board[0].cpu().numpy()
            masks[i] = 1 - self.get_actions_mask(board)
        return th.from_numpy(masks).to(self.device)    
    
    def sample_masked_actions(self, obs, distribution, deterministic=False, return_distribution=False):

        masks = get_mask(obs)
        masks[masks == 1] = -np.inf
        masked_logits = distribution.logits + masks
        if return_distribution:
            #Devuelve la distribucion nueva
            return th.distributions.Categorical(logits=masked_logits)
        if deterministic:
            #Devuelve la de mayor prboabilidad
            return th.argmax(masked_logits, axis=1)
        #Samplea la distribucion
        return th.distributions.Categorical(logits=masked_logits).sample()
    
    def _predict(self, observation, deterministic=False):
        """
        Get the action according to the policy for a given observation.
        :param observation:
        :param deterministic: Whether to use stochastic or deterministic actions
        :return: Taken action according to the policy
        """
        latent_pi, _, latent_sde = self._get_latent(observation)
        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
        
        if self.get_actions_mask:
            actions = self.sample_masked_actions(observation, distribution.distribution, deterministic=deterministic)
        else:
            actions = distribution.get_actions(deterministic=deterministic)
        
        return actions
    
    def forward(self, obs: th.Tensor, deterministic: bool = False):
        """
        Forward pass in all the networks (actor and critic)
        :param obs: Observation
        :param deterministic: Whether to sample or use deterministic actions
        :return: action, value and log probability of the action
        """
        latent_pi, latent_vf, latent_sde = self._get_latent(obs)
        # Evaluate the values for the given observations
        values = self.value_net(latent_vf)
        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde=latent_sde)
        
        
        if self.get_actions_mask:
            actions = self.sample_masked_actions(obs, distribution.distribution, deterministic=deterministic)
        else:
            actions = distribution.get_actions(deterministic=deterministic)

        log_prob = distribution.log_prob(actions)
        return actions, values, log_prob
    
    def evaluate_actions(self, obs: th.Tensor, actions: th.Tensor):
        """
        Evaluate actions according to the current policy,
        given the observations.
        :param obs:
        :param actions:
        :return: estimated value, log likelihood of taking those actions
            and entropy of the action distribution.
        """
        latent_pi, latent_vf, latent_sde = self._get_latent(obs)
        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
        distrib = self.sample_masked_actions(obs, distribution.distribution, return_distribution=True)

        log_prob = distrib.log_prob(actions)
        values = self.value_net(latent_vf)
        return values, log_prob, distrib.entropy()

In [13]:
model = PPO(
    CustomActorCriticPolicy,
    env,
    verbose=0,
    policy_kwargs = {'actions_mask_func': get_actions_mask}
)

In [14]:
# Testeo de predict
model.policy.get_actions_mask(env.reset()[0][0])

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0], dtype=int8)

In [15]:
obs = env.reset()

In [16]:
actions, _ = model.predict(obs)

In [17]:
# Verificar que las acciones son válidas
actions

array([ 2,  2,  4,  3, 14,  7, 11,  1, 14,  2], dtype=int64)

In [18]:
# Testeo de forward
model.policy(th.from_numpy(obs).to(model.device))

(tensor([13, 13,  4, 11, 14,  2,  1,  1, 12,  8]),
 tensor([[ 0.4753],
         [ 0.4753],
         [ 0.2941],
         [-0.2490],
         [ 0.2941],
         [ 0.4753],
         [-0.3658],
         [-0.2490],
         [ 0.5616],
         [ 0.4753]], grad_fn=<AddmmBackward0>),
 tensor([-2.7672, -2.7672, -2.7714, -2.7734, -2.7667, -2.7703, -2.7669, -2.7729,
         -2.7731, -2.7737], grad_fn=<SqueezeBackward1>))

# Corremos PPO

In [19]:
board_shape = 4
n_envs = 6
gamma = 0.99
ent_coef = 0.0
gae_lambda = 0.95
n_epochs = 10

In [20]:
prefix = 'Reversi_PPO'
suffix = 'masked_actions'
model_name = f'{prefix}_{board_shape}by{board_shape}_{gamma}_{gae_lambda}_{ent_coef}_{n_epochs}_{n_envs}_{suffix}'
best_model_save_path = f'./models/{model_name}'
print(model_name)
print(best_model_save_path)

Reversi_PPO_4by4_0.99_0.95_0.0_10_6_masked_actions
./models/Reversi_PPO_4by4_0.99_0.95_0.0_10_6_masked_actions


In [21]:
model = PPO(
    CustomActorCriticPolicy,
    env,
    verbose=0,
    tensorboard_log='tensorboard_log',
    gamma=gamma,
    gae_lambda=gae_lambda,
    ent_coef=ent_coef,
    n_epochs=n_epochs,
    policy_kwargs = {'actions_mask_func': get_actions_mask}
)

In [22]:
from stable_baselines3.common.callbacks import EvalCallback

In [23]:
# El entorno de evaluación no corre en paralelo por eso uno solo
eval_env = make_reversi_vec_env(
    SelfPlayEnv, n_envs=1,
    env_kwargs={
        'board_shape': board_shape,
        'LocalPlayer': RandomPlayer
    }
)

In [24]:
eval_callback = EvalCallback(
    eval_env = eval_env,
    eval_freq=1_000,
    n_eval_episodes=500,
    deterministic=True,
    verbose=1,
    best_model_save_path=best_model_save_path,
) 

In [25]:
model.learn(total_timesteps=int(1e10), callback=[eval_callback])

Eval num_timesteps=10000, episode_reward=-0.15 +/- 0.94
Episode length: 5.76 +/- 0.73
New best mean reward!
Eval num_timesteps=20000, episode_reward=-0.00 +/- 0.96
Episode length: 5.84 +/- 0.69
New best mean reward!
Eval num_timesteps=30000, episode_reward=0.54 +/- 0.83
Episode length: 6.11 +/- 0.58
New best mean reward!
Eval num_timesteps=40000, episode_reward=0.53 +/- 0.83
Episode length: 6.12 +/- 0.59
Eval num_timesteps=50000, episode_reward=0.54 +/- 0.83
Episode length: 6.32 +/- 0.70
Eval num_timesteps=60000, episode_reward=0.57 +/- 0.82
Episode length: 6.30 +/- 0.64
New best mean reward!
Eval num_timesteps=70000, episode_reward=0.35 +/- 0.89
Episode length: 6.03 +/- 0.54
Eval num_timesteps=80000, episode_reward=0.53 +/- 0.82
Episode length: 6.03 +/- 0.55
Eval num_timesteps=90000, episode_reward=0.63 +/- 0.72
Episode length: 6.21 +/- 0.52
New best mean reward!
Eval num_timesteps=100000, episode_reward=0.61 +/- 0.73
Episode length: 6.18 +/- 0.49
Eval num_timesteps=110000, episode_re

KeyboardInterrupt: 

El modelo se guarda en la carpeta 'models'

Hay un modelo para tablero 8x8 y uno para tablero 4x4. El de 4 es el que se pone a prueba contra los otros jugadores