<a href="https://colab.research.google.com/github/natanrajch/DiploDatos/blob/main/aprendizaje_refuerzos/TicTacToe_RL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Instalaciones

In [2]:
!pip install stable-baselines3[extra,tests,docs]



In [1]:
#@title Instalación de RLBaselinesZoo (no modificar)

# Estamos en Colab?

try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

if IN_COLAB:
    !git clone --recursive https://github.com/DLR-RM/rl-baselines3-zoo
    !cd rl-baselines3-zoo/
    !apt-get install swig cmake ffmpeg
    !pip install -r /content/rl-baselines3-zoo/requirements.txt

Cloning into 'rl-baselines3-zoo'...
remote: Enumerating objects: 3210, done.[K
remote: Counting objects: 100% (270/270), done.[K
remote: Compressing objects: 100% (177/177), done.[K
remote: Total 3210 (delta 170), reused 162 (delta 86), pack-reused 2940[K
Receiving objects: 100% (3210/3210), 2.15 MiB | 19.81 MiB/s, done.
Resolving deltas: 100% (2098/2098), done.
Submodule 'rl-trained-agents' (https://github.com/DLR-RM/rl-trained-agents) registered for path 'rl-trained-agents'
Cloning into '/content/rl-baselines3-zoo/rl-trained-agents'...
remote: Enumerating objects: 1706, done.        
remote: Counting objects: 100% (291/291), done.        
remote: Compressing objects: 100% (202/202), done.        
remote: Total 1706 (delta 96), reused 278 (delta 89), pack-reused 1415        
Receiving objects: 100% (1706/1706), 1.17 GiB | 33.43 MiB/s, done.
Resolving deltas: 100% (316/316), done.
Submodule path 'rl-trained-agents': checked out '3dd2af4cee930750016cf943dc6393bada57b89c'
Reading pac

In [3]:
import os
from subprocess import Popen, PIPE

import numpy as np
import matplotlib.pyplot as plt
import random

import gym
from gym import spaces
from gym.spaces import Discrete, Box


from stable_baselines3 import DQN, PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from stable_baselines3.common.env_util import make_vec_env

### Custom Environment de ta te tí

Se realiza la implementación de un Custom Environment para aprender a jugar Ta Te Tí.
El trabajo toma la estructura, y la función de renderizado (render) de https://github.com/francofgp/Tic-Tac-Toe-Gym

Sin embargo se ha modificado completamente las funciones __init__ y step. Además se ha programado un oponente con cierta inteligencia contra el que juega nuestro robot, de modo de conseguir una IA mejor.

En esta implementación, hemos probado con un action_space que se recalcula según el estado del juego (el número de acciones es variable según avanza la partida).

Tras 500.000 iteraciones, nuestro robot alcanza un reward que implica que está en una relación ganar/empatar de 65/35. Esto se chequea al final con el modelo terminado, aprovechando el dict info del environment.


In [14]:
class TicTacToe(gym.Env):
  """
  Ambiente personalizado que sigue la interfaz de gym.
  Es un entorno simple en el cuál el agente debe aprender ganarle a un adversario simple
  """
  metadata = {'render.modes': ['console']}

  def __init__(self, grid_size=10):
    super(TicTacToe, self).__init__()
    
    self.observation_space = Box(low=np.zeros(9), high=np.full((9, ), 3))
    self.state = [0,0,0,0,0,0,0,0,0]
    self.win_combs = [[0,1,2],[3,4,5],[6,7,8], #filas
                       [0,3,6],[1,4,7],[2,5,8], #cols
                       [0,4,8],[2,4,6]] #diags
  
  def step(self, action):
    reward = 0
    pos = 0
    zpos = 0
    info = {}
    #Un valor de action=3 implica poner una Cruz en el tercer lugar vacío (No en el 3er lugar de la grilla, sino el 3ero de los vacíos)
    #Esto se ha planteado así para poder aprovechar la función Discrete, teniendo en cuenta que el espacio de acciones varía step a step.
    for st in self.state: 
      if st == 0:
        if zpos == action:
          self.state[pos] = 1
        zpos += 1
      pos += 1  
    if self.check_win():
      reward += 1000
      done = True
      info['resultado'] = 'Victoria'
    elif self.check_draw():
      reward = reward -100
      done=True
      info['resultado'] = 'Empate'
    else:
      self.player2() #Función con IA del adversario
      if self.check_lose():
        reward -= 1000
        done = True
        info['resultado'] = 'Derrota'
      elif self.check_draw():
        reward = reward -100
        done=True
        info['resultado'] = 'Empate'
      else:
        done = False  
    
    return np.array([self.state]).astype(np.float32), reward, done, info
    


  def player2(self):
    '''Si puede formar un ta-te-tí, lo hace, luego si puede tapar una línea del Player 1 la genera, si no, juega random '''
    p2_played = False
    if self.state.count(0) > 0:
      for comb in self.win_combs:
        for skip in [0,1,2]:
          if self.state[comb[skip]]==0 and self.state[comb[[X for X in [0,1,2] if X != skip][0]]]==2 and self.state[comb[[X for X in [0,1,2] if X != skip][1]]]==2:   
            self.state[comb[skip]]=2           
            p2_played = True
            break
        else:
          continue
        break
      
      if not p2_played:
        for comb in self.win_combs:
          for skip in [0,1,2]:
            if self.state[comb[skip]]==0 and self.state[comb[[X for X in [0,1,2] if X != skip][0]]]==1 and self.state[comb[[X for X in [0,1,2] if X != skip][1]]]==1:   
              self.state[comb[skip]]=2           
              p2_played = True
              break
          else:
            continue
          break
        
      if not p2_played:
        random_action = random.randint(0,self.state.count(0)-1)
        pos = 0
        zpos = 0
        for st in self.state:
          if st == 0:
            if zpos == random_action:
              self.state[pos] = 2
            zpos += 1
          pos += 1  
  
  def check_win(self):
    for comb in self.win_combs:
      if self.state[comb[0]]==1 and self.state[comb[1]]==1 and self.state[comb[2]]==1:
        return True
    return False

  def check_lose(self):
    for comb in self.win_combs:
      if self.state[comb[0]]==2 and self.state[comb[1]]==2 and self.state[comb[2]]==2:
        return True
    return False
    
  def check_draw(self):
    if self.state.count(0) > 0:
      return False
    else:
      return True
  
  def reset(self):
    self.state=[0,0,0,0,0,0,0,0,0]

    return np.array([self.state]).astype(np.float32)

  def render(self, mode='console'):
    if mode != 'console':
      raise NotImplementedError()
    # en nuestra interfaz de consola, representamos el agente como una cruz, y 
    # el resto como un punto
    draw = ["-", "X", "O"]
    drawing = [draw[st] for st in self.state]
    for i in range(0,9,3):
      if i==0:
        print(f"{drawing[i]} {drawing[i+1]} {drawing[i+2]} ")
      if i==3:
        print(f"{drawing[i]} {drawing[i+1]} {drawing[i+2]} ")
      if i==6:
        print(f"{drawing[i]} {drawing[i+1]} {drawing[i+2]} ")
  def close(self):
    pass

  @property
  def action_space(self):
    return Discrete(self.state.count(0))

In [19]:
env = TicTacToe()
env = make_vec_env(lambda: env, n_envs=1)
model = PPO('MlpPolicy', env, verbose=0).learn(1000_000)



Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 3.92     |
|    ep_rew_mean     | -964     |
| time/              |          |
|    fps             | 1911     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 3.8          |
|    ep_rew_mean          | -931         |
| time/                   |              |
|    fps                  | 1461         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0036309846 |
|    clip_fraction        | 0.00986      |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.2         |
|    explained_variance   | 5.52e-05     

In [6]:
model.save("TicTacToe_1111_2018")

### Resultados
Porcentaje de victorias y empates:

In [21]:
victorias = 0
empates = 0
derrotas = 0

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    if dones:
      obs = env.reset()
      if info[0]['resultado'] == 'Victoria':
        victorias += 1
      if info[0]['resultado'] == 'Empate':
        empates += 1
      if info[0]['resultado'] == 'Derrota':
        derrotas += 1
print('Victorias = ', victorias/(victorias+empates+derrotas))     
print('Empates = ', empates/(victorias+empates+derrotas))     
print('Derrotas = ', derrotas/(victorias+empates+derrotas))     
      

Victorias =  0.5956521739130435
Empates =  0.34347826086956523
Derrotas =  0.06086956521739131


Se imprime a continuación un ejemplo del robot jugando unas 20/25 partidas.
En cada step se observa al inicio la acción tomada por el robot (recordar que la acción señala la posición del lugar, sólo contando a los lugares vacíos de la grilla 3x3).

Luego se dibuja la grilla, en la que se marca con una X al robot, y con un O al adversario. En cada step se agregan los 2 movimientos.

Luego se marca, si en ese movimiento se ha terminado, el resultado de la partida para nuestro robot.



In [22]:
obs = env.reset()
for i in range(100):
    action, _states = model.predict(obs)
    print(action)
    obs, rewards, dones, info = env.step(action)
    env.render(mode='console')
    if dones:
      obs = env.reset()
      print(info[0]['resultado'], 'mediante action=', action)
      print("-*-*-*-*-*-*-*-*-*-*-*-*-")
    else:
      print("-------")

[2]
- - X 
- O - 
- - - 
-------
[3]
- - X 
- O X 
- - O 
-------
[0]
X O X 
- O X 
- - O 
-------
[2]
X O X 
O O X 
- X O 
-------
[1]
- - - 
- - - 
- - - 
Empate mediante action= [1]
-*-*-*-*-*-*-*-*-*-*-*-*-
[2]
O - X 
- - - 
- - - 
-------
[1]
O - X 
X - - 
- - O 
-------
[1]
O - X 
X X O 
- - O 
-------
[1]
- - - 
- - - 
- - - 
Victoria mediante action= [1]
-*-*-*-*-*-*-*-*-*-*-*-*-
[2]
- O X 
- - - 
- - - 
-------
[0]
X O X 
- - - 
O - - 
-------
[1]
X O X 
- X - 
O - O 
-------
[2]
X O X 
O X - 
O X O 
-------
[0]
- - - 
- - - 
- - - 
Empate mediante action= [0]
-*-*-*-*-*-*-*-*-*-*-*-*-
[2]
- - X 
- - - 
- - O 
-------
[1]
O X X 
- - - 
- - O 
-------
[1]
O X X 
- X - 
- O O 
-------
[2]
- - - 
- - - 
- - - 
Victoria mediante action= [2]
-*-*-*-*-*-*-*-*-*-*-*-*-
[2]
- - X 
- - - 
- - O 
-------
[1]
O X X 
- - - 
- - O 
-------
[1]
O X X 
- X - 
- O O 
-------
[2]
- - - 
- - - 
- - - 
Victoria mediante action= [2]
-*-*-*-*-*-*-*-*-*-*-*-*-
[2]
O - X 
- - - 
- - - 
-------
[1]
O

## Ejercicio 2

### Entorno 4x4

In [18]:
class TicTacToeAdvanced(TicTacToe):
    """
    Ambiente personalizado que sigue la interfaz de gym.
    Es un entorno más complejo en el cuál el agente debe aprender ganarle a un adversario simple
    """
    metadata = {'render.modes': ['console']}

    def __init__(self, grid_size=10):
        super(TicTacToe, self).__init__()
    
        self.observation_space = Box(low=np.zeros(16), high=np.full((16, ), 4))
        self.state = [
            0,0,0,0,
            0,0,0,0,
            0,0,0,0,
            0,0,0,0
        ]
        self.win_combs = [
            [0,1,2,3], [4,5,6,7], [8,9,10,11], [12,13,14,15], # filas
            [0,4,8,12],[1,5,9,13],[2,6,10,14], [3,7,11,15], # cols
            [0,5,10,15],[3,6,9,12] # diags
        ]

    def check_win(self):
        for comb in self.win_combs:
            if self.state[comb[0]] == 1 and self.state[comb[1]] == 1 \
                and self.state[comb[2]] == 1 and self.state[comb[3]] == 1:
                return True
        return False 

 
    def check_lose(self):
        for comb in self.win_combs:
            if self.state[comb[0]] == 2 and self.state[comb[1]] == 2 \
                and self.state[comb[2]] == 2 and self.state[comb[3]] == 2:
                return True
        return False 

    def reset(self):
        self.state = [
            0,0,0,0,
            0,0,0,0,
            0,0,0,0,
            0,0,0,0
        ]

        return np.array([self.state]).astype(np.float32)

    def render(self, mode='console'):
        if mode != 'console':
            raise NotImplementedError()
        # en nuestra interfaz de consola, representamos el agente como una cruz, y 
        # el resto como un punto
        draw = ["-", "X", "O"]
        drawing = [draw[st] for st in self.state]
        for i in range(0,16,4):
            if i == 0:
                print(f"{drawing[i]} {drawing[i+1]} {drawing[i+2]} {drawing[i+3]}")
            if i == 4:
                print(f"{drawing[i]} {drawing[i+1]} {drawing[i+2]} {drawing[i+3]}")
            if i == 8:
                print(f"{drawing[i]} {drawing[i+1]} {drawing[i+2]} {drawing[i+3]}")
            if i == 12:
                print(f"{drawing[i]} {drawing[i+1]} {drawing[i+2]} {drawing[i+3]}")

In [19]:
env = TicTacToeAdvanced()
env = make_vec_env(lambda: env, n_envs=1)
model = PPO('MlpPolicy', env, verbose=1).learn(250_000)



Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 8.21     |
|    ep_rew_mean     | -946     |
| time/              |          |
|    fps             | 2053     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 7.79         |
|    ep_rew_mean          | -901         |
| time/                   |              |
|    fps                  | 1446         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0018705858 |
|    clip_fraction        | 0.000391     |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.77        |
|    explained_variance   | 1.56e-05     

In [20]:
model.save("TicTacToeAdvanced")

In [21]:
victorias = 0
empates = 0
derrotas = 0

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    if dones:
      if info[0]['resultado'] == 'Victoria':
        victorias += 1
      if info[0]['resultado'] == 'Empate':
        empates += 1
      if info[0]['resultado'] == 'Derrota':
        derrotas += 1
print('Victorias = ', victorias/(victorias+empates+derrotas))     
print('Empates = ', empates/(victorias+empates+derrotas))     
print('Derrotas = ', derrotas/(victorias+empates+derrotas))

Victorias =  0.023622047244094488
Empates =  0.8346456692913385
Derrotas =  0.14173228346456693


In [22]:
obs = env.reset()
for i in range(100):
    action, _states = model.predict(obs)
    print(action)
    obs, rewards, dones, info = env.step(action)
    env.render(mode='console')
    if dones:
      obs = env.reset()
      print(info[0]['resultado'], 'mediante action=', action)
      print("-*-*-*-*-*-*-*-*-*-*-*-*-")
    else:
      print("-------")

[10]
- - - -
- - - -
- - X -
- O - -
-------
[0]
X - - -
- O - -
- - X -
- O - -
-------
[0]
X X O -
- O - -
- - X -
- O - -
-------
[0]
X X O X
- O O -
- - X -
- O - -
-------
[0]
X X O X
X O O -
O - X -
- O - -
-------
[0]
X X O X
X O O X
O - X O
- O - -
-------
[0]
X X O X
X O O X
O X X O
- O - O
-------
[0]
- - - -
- - - -
- - - -
- - - -
Empate mediante action= [0]
-*-*-*-*-*-*-*-*-*-*-*-*-
[8]
- - - -
- - O -
X - - -
- - - -
-------
[8]
- - - -
- - O -
X O X -
- - - -
-------
[8]
- - - O
- - O -
X O X -
X - - -
-------
[10]
- O - O
- - O -
X O X -
X - - -
-------
[0]
X O - O
O - O -
X O X -
X - - -
-------
[0]
X O X O
O O O -
X O X -
X - - -
-------
[0]
X O X O
O O O X
X O X O
X - - -
-------
[0]
X O X O
O O O X
X O X O
X X O -
-------
[0]
- - - -
- - - -
- - - -
- - - -
Empate mediante action= [0]
-*-*-*-*-*-*-*-*-*-*-*-*-
[10]
- O - -
- - - -
- - X -
- - - -
-------
[0]
X O - -
- O - -
- - X -
- - - -
-------
[0]
X O X -
- O O -
- - X -
- - - -
-------
[0]
X O X X
O O O -
- - X

### Entorno con posiciones penalizadas

En este modelo se penaliza al jugador si su jugada comienza en la posición central.

In [33]:
class TicTacToeDisabled(TicTacToe):
    """
    Ambiente personalizado que sigue la interfaz de gym.
    Es un entorno más complejo en el cuál el agente debe aprender ganarle a un 
    adversario simple. En este entorno, no se permite marcar en la posición 
    central para comenzar la partida.
    """
    metadata = {'render.modes': ['console']}

    def __init__(self, grid_size=10):
        super(TicTacToe, self).__init__()

        self.is_first_move = True
        self.disabled_position = 4
        self.observation_space = Box(low=np.zeros(9), high=np.full((9, ), 3))
        self.state = [
            0,0,0,
            0,0,0,
            0,0,0,
        ]
        self.win_combs = [
            [0,1,2],[3,4,5],[6,7,8], # filas
            [0,3,6],[1,4,7],[2,5,8], # cols
            [0,4,8],[2,4,6] # diags
        ] 

    def step(self, action):
        reward = 0
        pos = 0
        zpos = 0
        info = {}
        # Un valor de action=3 implica poner una Cruz en el tercer lugar vacío (No en el 3er lugar de la grilla, sino el 3ero de los vacíos)
        # Esto se ha planteado así para poder aprovechar la función Discrete, teniendo en cuenta que el espacio de acciones varía step a step.
        for st in self.state: 
            if st == 0:
                if zpos == action:
                    self.state[pos] = 1
                zpos += 1
            position = pos
            pos += 1  

        if self.check_disable_first_move(position):
            reward -= 500

        if self.check_win():
            reward += 1000
            done = True
            info['resultado'] = 'Victoria'
        elif self.check_draw():
            reward = reward -100
            done=True
            info['resultado'] = 'Empate'
        else:
            self.player2() #Función con IA del adversario
            if self.check_lose():
                reward -= 1000
                done = True
                info['resultado'] = 'Derrota'
            elif self.check_draw():
                reward = reward -100
                done=True
                info['resultado'] = 'Empate'
            else:
                done = False  
        
        return np.array([self.state]).astype(np.float32), reward, done, info

    def check_disable_first_move(self, pos):
        if pos == self.disabled_position and self.is_first_move:
            self.is_first_move = False
            return True
        else:
            self.is_first_move = False
            return False

    def reset(self):
        self.state = [
            0,0,0,
            0,-1,0,
            0,0,0,
        ]
        return np.array([self.state]).astype(np.float32)

In [34]:
env = TicTacToeDisabled()
env = make_vec_env(lambda: env, n_envs=1)
model = PPO('MlpPolicy', env, verbose=1).learn(250_000)



Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 4.24     |
|    ep_rew_mean     | -775     |
| time/              |          |
|    fps             | 2013     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.27        |
|    ep_rew_mean          | -784        |
| time/                   |             |
|    fps                  | 1438        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.006441423 |
|    clip_fraction        | 0.0274      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.19       |
|    explained_variance   | -0.000132   |
|    learning

In [35]:
model.save("TicTacToeDisabled")

In [36]:
victorias = 0
empates = 0
derrotas = 0

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    if dones:
      if info[0]['resultado'] == 'Victoria':
        victorias += 1
      if info[0]['resultado'] == 'Empate':
        empates += 1
      if info[0]['resultado'] == 'Derrota':
        derrotas += 1
print('Victorias = ', victorias/(victorias+empates+derrotas))     
print('Empates = ', empates/(victorias+empates+derrotas))     
print('Derrotas = ', derrotas/(victorias+empates+derrotas))

Victorias =  0.132
Empates =  0.852
Derrotas =  0.016


In [37]:
obs = env.reset()
for i in range(100):
    action, _states = model.predict(obs)
    print(action)
    obs, rewards, dones, info = env.step(action)
    env.render(mode='console')
    if dones:
      obs = env.reset()
      print(info[0]['resultado'], 'mediante action=', action)
      print("-*-*-*-*-*-*-*-*-*-*-*-*-")
    else:
      print("-------")

[4]
- - - 
- O X 
- - O 
-------
[0]
X - - 
O O X 
- - O 
-------
[1]
X O X 
O O X 
- - O 
-------
[1]
- - - 
- O - 
- - - 
Empate mediante action= [1]
-*-*-*-*-*-*-*-*-*-*-*-*-
[5]
- O - 
- O - 
X - - 
-------
[0]
X O - 
O O - 
X - - 
-------
[1]
X O - 
O O X 
X - O 
-------
[1]
- - - 
- O - 
- - - 
Empate mediante action= [1]
-*-*-*-*-*-*-*-*-*-*-*-*-
[4]
O - - 
- O X 
- - - 
-------
[1]
O - X 
- O X 
- - O 
-------
[1]
O O X 
X O X 
- - O 
-------
[1]
- - - 
- O - 
- - - 
Empate mediante action= [1]
-*-*-*-*-*-*-*-*-*-*-*-*-
[4]
- - - 
- O X 
O - - 
-------
[0]
X O - 
- O X 
O - - 
-------
[1]
X O O 
X O X 
O - - 
-------
[1]
- - - 
- O - 
- - - 
Empate mediante action= [1]
-*-*-*-*-*-*-*-*-*-*-*-*-
[4]
- O - 
- O X 
- - - 
-------
[0]
X O - 
- O X 
- - O 
-------
[1]
X O - 
X O X 
O - O 
-------
[1]
- - - 
- O - 
- - - 
Empate mediante action= [1]
-*-*-*-*-*-*-*-*-*-*-*-*-
[0]
X - - 
- O - 
- - O 
-------
[1]
X O X 
- O - 
- - O 
-------
[1]
X O X 
- O X 
- O O 
-------
[1]
- - - 
