In [4]:
# Let's do independent Q-learning in Tic-Tac-Toe, and play it against random.
# RL is based on python/examples/independent_tabular_qlearning.py
from open_spiel.python import rl_environment
from open_spiel.python import rl_tools
from open_spiel.python.algorithms import tabular_qlearner

# Create the environment
env = rl_environment.Environment("airline_seats")
num_players = env.num_players
num_actions = env.action_spec()["num_actions"]

# Create the agents
agents = [
    tabular_qlearner.QLearner(player_id=idx, num_actions=num_actions)
    for idx in range(num_players)
]
# Train the Q-learning agents in self-play.
wins = 0.0
lossties = 0.0
ep_rewards = 0.0
for cur_episode in range(25000):
  time_step = env.reset()
  while not time_step.last():
    player_id = time_step.observations["current_player"]
    agent_output = agents[player_id].step(time_step)
    time_step = env.step([agent_output.action])
  if time_step.rewards[0] > time_step.rewards[1]:
    wins += 1
  else:
    lossties += 1
  ep_reward = time_step.rewards[0]
  ep_rewards += ep_reward
  if cur_episode %1000 == 0:
    print(f"Episodes: {cur_episode} Reward: {ep_rewards/1000} Win ratio: {wins/(wins+lossties)}")
    wins = 0.0
    lossties = 0.0
    ep_rewards = 0.0
    
  # Episode is over, step all agents with final info state.
  for agent in agents:
    agent.step(time_step)
print("Done!")
# Evaluate the Q-learning agent against a random agent.
from open_spiel.python.algorithms import random_agent
eval_agents = [agents[0], random_agent.RandomAgent(1, num_actions, "Entropy Master 2000") ]

time_step = env.reset()
while not time_step.last():
  print("")
  print(env.get_state)
  player_id = time_step.observations["current_player"]
  # Note the evaluation flag. A Q-learner will set epsilon=0 here.
  agent_output = eval_agents[player_id].step(time_step, is_evaluation=True)
  print(f"Agent {player_id} chooses {env.get_state.action_to_string(agent_output.action)}")
  time_step = env.step([agent_output.action])

print("")
print(env.get_state)
print(time_step.rewards)

Episodes: 0 Reward: -4.88 Win ratio: 0.0
Episodes: 1000 Reward: -3594.822 Win ratio: 0.469
Episodes: 2000 Reward: -3514.349 Win ratio: 0.51
Episodes: 3000 Reward: -3519.448 Win ratio: 0.531
Episodes: 4000 Reward: -3547.399 Win ratio: 0.516
Episodes: 5000 Reward: -3487.281 Win ratio: 0.508
Episodes: 6000 Reward: -3551.246 Win ratio: 0.494
Episodes: 7000 Reward: -3536.918 Win ratio: 0.507
Episodes: 8000 Reward: -3518.177 Win ratio: 0.509
Episodes: 9000 Reward: -3609.823 Win ratio: 0.489
Episodes: 10000 Reward: -3500.971 Win ratio: 0.524
Episodes: 11000 Reward: -3419.015 Win ratio: 0.538
Episodes: 12000 Reward: -3529.369 Win ratio: 0.483
Episodes: 13000 Reward: -3487.564 Win ratio: 0.494
Episodes: 14000 Reward: -3501.737 Win ratio: 0.486
Episodes: 15000 Reward: -3537.459 Win ratio: 0.497
Episodes: 16000 Reward: -3466.366 Win ratio: 0.52
Episodes: 17000 Reward: -3539.072 Win ratio: 0.512
Episodes: 18000 Reward: -3400.523 Win ratio: 0.534
Episodes: 19000 Reward: -3506.404 Win ratio: 0.494
E