# Crear un TorchPlayer


Recibe el modelo a instanciar como path y juega con el mismo

- Pensar como resolver el problema de que solo samplee las válidas
- Agregarle la opción de monte carlo tree search (opcional) con las opciones de iterationLimit, timeLimit

Si va a agregar MCTS mirar la notebook 007_MCTS.ipnb

In [1]:
from stable_baselines3 import PPO
from boardgame2 import ReversiEnv
import numpy as np

In [2]:
class TorchPlayer(): 
    def __init__(self, model_path, player=1, board_shape=None, env=None, deterministic=True, only_valid=True, mcts=False, iterationLimit=None, timeLimit=None, flatten_action=False):
        self.model = PPO.load(model_path)
        if (env is None) and (board_shape is None):
            print("board_shape and env can't be both None")
        if env is None:
            env = ReversiEnv(board_shape=board_shape)
            
        env.reset()
        self.env = env
        self.player = player
        self.board_shape = self.env.board.shape[0]
        
    def predict(self, board):
        
        input_board = board * self.player
        array_input = [[np.array(input_board)]]
        action = self.model.predict(array_input)[0][0]
        return [action // self.board_shape, action % self.board_shape]

# Arena

Testear el jugador contra los distintos jugadores

In [3]:
def arena_stats(player_1, player_2, board_shape, N=500):
    
    env = ReversiEnv(board_shape=board_shape)
    wins_as_first = 0
    wins_as_second = 0
    plays_as_first = 0
    plays_as_second = 0
    total_steps = 0

    for i in range(N):
        # Aveces empieza un jugador, a veces el otro
        first_player = np.random.choice([-1, 1])
        player_1.player = first_player
        player_2.player = -first_player
        
        plays_as_first = plays_as_first + (first_player == 1)
        plays_as_second = plays_as_second + (first_player == -1)
        
        done = False
        n_steps = 0
        (board, player) = env.reset()
        
        while not done:
            if first_player == player:
                action = player_1.predict(board)
            else:
                action = player_2.predict(board)
            (board, player), reward, done, info = env.step(action)
            n_steps = n_steps + 1
        total_steps = total_steps + n_steps
        wins_as_first = wins_as_first + (reward == first_player) * (first_player == 1)
        wins_as_second = wins_as_second + (reward == first_player) * (first_player == -1)
    print(f'Wins as first: {wins_as_first/plays_as_first}')
    print(f'Wins as second: {wins_as_second/plays_as_second}')
    print(f'Plays as first: {plays_as_first}')
    print(f'Plays as second: {plays_as_second}')
    print(f'Avg game duration: {total_steps/N}')
        

### Instanciamos los jugadores que se van a pasar al arena stats

In [4]:
from players import RandomPlayer, GreedyPlayer, DictPolicyPlayer

In [5]:
torchPlayer = TorchPlayer('models/Reversi_PPO_4by4_0.99_0.95_0.0_10_6_masked_actions/best_model.zip', player=1, board_shape=4)
#player1 = TorchPlayer('models/Reversi_PPO_8by8_0.99_0.95_0.0_10_6_masked_actions/best_model.zip', player=1, board_shape=8)

In [6]:
greedyPlayer = GreedyPlayer(player=-1, board_shape=4, flatten_action=False)

In [7]:
randomPlayer = RandomPlayer(player=-1, board_shape=4, flatten_action=False)

In [8]:
dictPolicyPlayer = DictPolicyPlayer(player=-1, board_shape=4, flatten_action=False)

In [7]:
#env_test = player1.env
#(first_input_board, plf) = env_test.reset()
#action = player1.predict(first_input_board)
#(next_input_board, pl) , _, _, _ = env_test.step(action)
#player1.predict(next_input_board)

[0, 3]

### Los hacemos jugar

In [9]:
arena_stats(torchPlayer, randomPlayer, 4, N=2000)

Wins as first: 0.6359223300970874
Wins as second: 0.9391752577319588
Plays as first: 1030
Plays as second: 970
Avg game duration: 11.87


In [10]:
arena_stats(torchPlayer, greedyPlayer, 4, N=2000)

Wins as first: 0.6270161290322581
Wins as second: 0.9345238095238095
Plays as first: 992
Plays as second: 1008
Avg game duration: 11.883


In [11]:
arena_stats(torchPlayer, dictPolicyPlayer, 4, N=2000)

Wins as first: 0.0
Wins as second: 0.0
Plays as first: 992
Plays as second: 1008
Avg game duration: 11.9905
