# In [None]:
# Let's do independent Q-learning in Airline Seats, and play it against random.
# RL is based on python/examples/independent_tabular_qlearning.py
from open_spiel.python import rl_environment
from open_spiel.python import rl_tools
from open_spiel.python.algorithms import tabular_qlearner
from open_spiel.python.algorithms import random_agent

# Build the RL environment for the "airline_seats" game and read the
# sizes the agents need from it.
env = rl_environment.Environment("airline_seats")
num_players = env.num_players
spec = env.action_spec()
num_actions = spec["num_actions"]

# Seat 0 is the tabular Q-learner that gets trained; seat 1 is a RandomAgent
# used as the opponent.
q_agent = tabular_qlearner.QLearner(player_id=0, num_actions=num_actions)
random_opponent = random_agent.RandomAgent(1, num_actions, "Entropy Master 2000")
agents = [q_agent, random_opponent]
# Train the Q-learning agent (player 0) in play vs the random agent (player 1).
#
# Statistics are collected per *episode* and reported once every
# `report_every` completed episodes:
#   * ep_rewards: summed episode returns of both players over the window,
#   * wins:       episodes in the window where player 0's return was strictly
#                 greater than player 1's,
#   * games:      episodes completed in the window.
num_train_episodes = 5000
report_every = 1000
ep_rewards = [0.0, 0.0]
wins = 0
games = 0
for cur_episode in range(num_train_episodes):
  time_step = env.reset()
  # Returns of the current episode only; needed to decide who won *this* game.
  episode_returns = [0.0, 0.0]
  while not time_step.last():
    player_id = time_step.observations["current_player"]
    agent_output = agents[player_id].step(time_step)
    time_step = env.step([agent_output.action])
    # Add up the rewards handed out after every step so that rewards given
    # before the terminal state are not lost.
    episode_returns[0] += time_step.rewards[0]
    episode_returns[1] += time_step.rewards[1]

  # Episode is over, step all agents with final info state.
  for agent in agents:
    agent.step(time_step)

  # Score the finished episode exactly once (not once per step).
  ep_rewards[0] += episode_returns[0]
  ep_rewards[1] += episode_returns[1]
  if episode_returns[0] > episode_returns[1]:
    wins += 1
  games += 1

  # Report after a full window so the averages divide by the number of
  # episodes actually played, then start a fresh window.
  if games == report_every:
    print(
        f"Episode {cur_episode + 1} | "
        f"Rewards agent: {ep_rewards[0] / games} - "
        f"random: {ep_rewards[1] / games} | "
        f"win rate: {wins / games}"
    )
    wins = 0
    games = 0
    ep_rewards = [0.0, 0.0]
print("Done!")
# Evaluate the Q-learning agent against a random agent.
#
# Plays `num_eval_episodes` games with the trained agent as player 0 and a
# fresh RandomAgent as player 1, then prints each player's average return
# per game.
num_eval_episodes = 1000

eval_agents = [agents[0], random_agent.RandomAgent(1, num_actions, "Entropy Master 2000")]

eval_rewards = [0.0, 0.0]
for _ in range(num_eval_episodes):
  time_step = env.reset()
  while not time_step.last():
    player_id = time_step.observations["current_player"]
    # Note the evaluation flag. A Q-learner will set epsilon=0 here.
    agent_output = eval_agents[player_id].step(time_step, is_evaluation=True)
    time_step = env.step([agent_output.action])
    # Accumulate after every step so rewards given before the terminal state
    # are included in the return.
    # NOTE(review): assumes time_step.rewards holds per-step rewards (not
    # cumulative returns) for this game -- confirm.
    eval_rewards[0] += time_step.rewards[0]
    eval_rewards[1] += time_step.rewards[1]
# Average rewards over all evaluation games
print(f"Agent: {eval_rewards[0]/num_eval_episodes} Random: {eval_rewards[1]/num_eval_episodes}")

# Play a single demonstration game, printing the state before every move,
# the move each agent picks, and finally the terminal state and rewards.
time_step = env.reset()
while not time_step.last():
  current_state = env.get_state
  # Blank line first, then the state, to separate consecutive turns.
  print("", current_state, sep="\n")
  mover = time_step.observations["current_player"]
  # Note the evaluation flag. A Q-learner will set epsilon=0 here.
  decision = eval_agents[mover].step(time_step, is_evaluation=True)
  move_text = current_state.action_to_string(decision.action)
  print(f"Agent {mover} chooses {move_text}")
  time_step = env.step([decision.action])
# Terminal position followed by the rewards reported on the last step.
print(env.get_state)
print(time_step.rewards)