# Environment code
Below is the code I've written for the simplified Pokemon environment. The main difference from the real game is that I have removed the Speed parameter for each Pokemon; instead, turn order is decided randomly (50/50) on every turn.

Step unit tests and Episode unit tests can be found in the next sections.

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
start_health = 4

def start_state():
  """Return a fresh 2x4 integer array for the start of a battle.

  Row 0 is the player and row 1 is the opponent.  Columns 0-2 hold the HP of
  the three Pokemon (each starting at ``start_health``); column 3 holds the
  index of the currently active Pokemon, which starts at 0.
  """
  # One side's row: three full-health Pokemon followed by the active index.
  side = [start_health] * 3 + [0]
  return np.array([side, side])

def end_state():
  """Return the terminal state: a 2x4 array of zeros.

  The array is created with an integer dtype (``np.zeros`` defaults to
  float64).  Column 3 of a state is used as an index into the HP columns,
  and a float there cannot be used as an index, so the terminal state must
  stay integer-valued to be usable wherever a regular state is.
  """
  return np.zeros((2, 4), dtype=int)

# Type chart, keyed by the active indices: attacker 1 vs target 0 deals double
# damage (likewise 2 vs 1 and 0 vs 2); the reverse pairings deal half damage.
def assign_damage(state, action, target_player):
  """Apply an attack against ``target_player``'s active Pokemon, in place.

  Args:
    state: 2x4 array; columns 0-2 are HP, column 3 is the active index.
    action: 3 is a weak attack (base damage 2, always hits); 4 is a strong
      attack (base damage 4, hits only when a uniform random draw exceeds
      0.5).  Any other value leaves ``state`` untouched.
    target_player: row (0 or 1) of the side being attacked; the other row is
      the attacker.

  Returns:
    The same ``state`` object, with the target's HP reduced (never below 0).
  """
  attacker_row = 1 - target_player
  target_slot = state[target_player][3]
  attacker_slot = state[attacker_row][3]

  # (attacker's active index, target's active index) -> damage multiplier;
  # any pairing not listed (e.g. same type) is neutral.
  type_chart = {
    (1, 0): 2, (2, 1): 2, (0, 2): 2,
    (0, 1): 0.5, (1, 2): 0.5, (2, 0): 0.5,
  }
  multiplier = type_chart.get((attacker_slot, target_slot), 1)

  if action == 3:
    base_damage, hits = 2, True
  elif action == 4:
    # the random draw happens only for strong attacks
    base_damage, hits = 4, np.random.random() > 0.5
  else:
    return state

  if hits:
    dealt = int(base_damage * multiplier)
    remaining = state[target_player][target_slot] - dealt
    state[target_player][target_slot] = max(remaining, 0)

  return state

# action: 0 = swap to 0, 1 = swap to 1, 2 = swap to 2, 3 = weak attack, 4 = strong attack
def step(state, action, opp):
  """Play one turn of the battle and return ``(reward, next_state)``.

  Args:
    state: 2x4 integer array; row 0 is the player, row 1 the opponent,
      columns 0-2 are HP and column 3 is the active Pokemon's index.
    action: the player's action code (see the comment above).  A swap to a
      fainted Pokemon is ignored, and so is any code outside 0-4.
    opp: callable that receives the pre-turn state and returns the
      opponent's action code.

  Returns:
    ``(1, end_state())`` if all of the opponent's Pokemon have fainted,
    ``(-1, end_state())`` if all of the player's have (the opponent is
    checked first), otherwise ``(0, next_state)``.

  The array passed in is not modified: the turn is resolved on a copy, so a
  caller may keep a reference to the previous state.
  """
  # get opponent's move, chosen from the same pre-turn state the player saw
  opp_action = opp(state)

  # Resolve the turn on a private copy so the caller's array is left intact.
  state = np.array(state)

  # Swaps resolve before attacks and only apply if the chosen Pokemon is alive.
  if action in (0, 1, 2) and state[0][action] > 0:
    state[0][3] = action
  if opp_action in (0, 1, 2) and state[1][opp_action] > 0:
    state[1][3] = opp_action

  # figure out who goes first, assume 50/50
  player_goes_first = np.random.random() > 0.5

  # active pokemon indices at the moment the attacks are exchanged
  opp_active_index = state[1][3]
  player_active_index = state[0][3]

  # handle the damage calculation; the second mover only acts if its active
  # pokemon survived the first attack
  if player_goes_first:
    state = assign_damage(state, action, 1)
    if state[1][opp_active_index] > 0:
      state = assign_damage(state, opp_action, 0)
  else:
    state = assign_damage(state, opp_action, 0)
    if state[0][player_active_index] > 0:
      state = assign_damage(state, action, 1)

  # check if all fainted on either side, opponent-first
  if np.sum(state[1][0:3]) == 0:
    return 1, end_state()
  if np.sum(state[0][0:3]) == 0:
    return -1, end_state()

  # swap out fainted pokemon on either side, opponent-first; the replacement
  # is the first living pokemon in a freshly shuffled order
  indices = np.array([0, 1, 2])
  np.random.shuffle(indices)
  if state[1][opp_active_index] == 0:
    for i in indices:
      if state[1][i] > 0:
        state[1][3] = i
        break

  np.random.shuffle(indices)
  if state[0][player_active_index] == 0:
    for i in indices:
      if state[0][i] > 0:
        state[0][3] = i
        break
  return 0, state

def run_episode(agent, opponent, max_steps=500):
  """Play one battle from the start state and return ``(reward, state)``.

  Args:
    agent: callable mapping a state to the player's action code.
    opponent: callable mapping a state to the opponent's action code.
    max_steps: safety cap on the number of turns (default 500, the value
      that used to be hard-coded).

  Returns:
    The non-zero reward reported by ``step`` together with the state it
    returned on that final turn.

  Raises:
    IndexError: if more than ``max_steps`` turns have been played, e.g. when
      both sides only ever swap and nobody can faint.
  """
  steps_taken = 0
  state = start_state()
  action = agent(state)
  reward = 0
  while reward == 0:
    reward, state = step(state, action, opponent)
    # The agent is queried after every turn, including the last one (the
    # action chosen for the final state is simply never used).
    action = agent(state)
    steps_taken += 1
    if steps_taken > max_steps:
      raise IndexError("episode ran for more than %d steps" % max_steps)

  return reward, state

# Single step unit tests

Below are some sanity checks to ensure that the state transitions work as expected when players attack. They mostly check that the HP and the active-Pokemon indicator update as expected.

In [344]:
# Player uses the weak attack (3) while the opponent "swaps" to its already
# active slot 0: the opponent's active pokemon should lose 2 health.
opponent = lambda x: 0
reward, state = step(start_state(), 3, opponent)
expected = np.array([[4, 4, 4, 0], [2, 4, 4, 0]])
assert np.array_equal(state, expected)

# Both sides use the weak attack: each active pokemon should be left with 2 health.
opponent = lambda x: 3
reward, state = step(start_state(), 3, opponent)
expected = np.array([[2, 4, 4, 0], [2, 4, 4, 0]])
assert np.array_equal(state, expected)

# The remaining checks call assign_damage directly, with the player (row 0,
# active index 0) attacking the opponent (row 1).

# Type disadvantage (target's active index is 1): expect half damage.
before = np.array([[4, 4, 4, 0], [4, 4, 4, 1]])
state = assign_damage(before, 3, 1)
expected = np.array([[4, 4, 4, 0], [4, 3, 4, 1]])
assert np.array_equal(state, expected)

# Type advantage (target's active index is 2): expect double damage.
before = np.array([[4, 4, 4, 0], [4, 4, 4, 2]])
state = assign_damage(before, 3, 1)
expected = np.array([[4, 4, 4, 0], [4, 4, 0, 2]])
assert np.array_equal(state, expected)

# Same type on both sides: expect normal damage.
before = np.array([[4, 4, 4, 0], [4, 4, 4, 0]])
state = assign_damage(before, 3, 1)
expected = np.array([[4, 4, 4, 0], [2, 4, 4, 0]])
assert np.array_equal(state, expected)

# Full episode unit tests
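Below, I've run entire episodes for simple strategies from both players. When one side only attacks and the other only swaps, the attacking side should always win. When both sides follow the same strategy, we should expect the number of wins from both sides to be about equal, which I check by summing up the rewards (which should be +/-1) and then dividing by the number of episodes; the result should be close to 0.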

In [345]:
# A player that only attacks (3 or 4) must always beat an opponent that only
# swaps (0, 1 or 2); every episode should end with reward +1 in the all-zero state.
for swap in (0, 1, 2):
  for attack in (3, 4):
    agent = lambda x: attack
    opponent = lambda x: swap
    for i in range(10):
      reward, state = run_episode(agent, opponent)
      assert reward == 1
      assert np.sum(state) == 0

# Mirror case: an opponent that only attacks must always beat a player that
# only swaps; every episode should end with reward -1 in the all-zero state.
for swap in (0, 1, 2):
  for attack in (3, 4):
    agent = lambda x: swap
    opponent = lambda x: attack
    for i in range(10):
      reward, state = run_episode(agent, opponent)
      assert reward == -1
      assert np.sum(state) == 0

In [346]:
# Equally matched test: both sides flip a coin each turn between swapping to
# slot 0 and using the weak attack.
agent = lambda x: 3 if np.random.random() <= 0.5 else 0
opponent = agent
num_trials = 1000
rewards = []
for i in range(num_trials):
  reward, state = run_episode(agent, opponent)
  rewards += [reward]
# Average reward; should be close to 0 since each side wins about half the time.
print(np.sum(rewards) / num_trials)

0.046


In [347]:
# Equally matched test with fixed policies: both sides always use the weak
# attack, so only the random turn order and replacement picks decide the winner.
agent = lambda x: 3
opponent = agent
num_trials = 1000
rewards = []
for i in range(num_trials):
  reward, state = run_episode(agent, opponent)
  rewards += [reward]
# Average reward; should be close to 0 since each side wins about half the time.
print(np.sum(rewards) / num_trials)

0.008


In [348]:
# Equally matched test where both sides always use the strong attack.
agent = lambda x: 4
opponent = agent
num_trials = 1000
rewards = []
for i in range(num_trials):
  reward, state = run_episode(agent, opponent)
  rewards += [reward]
# Average reward; should be close to 0 since each side wins about half the time.
print(np.sum(rewards) / num_trials)

-0.006


In [351]:
# Equally matched test across all choices: both sides pick uniformly among
# the five actions 0-4.  np.random.random() lies in [0, 1), so the scale
# factor must be 5 for the strong attack (4) to be reachable; a factor of 4
# would only ever produce 0-3.
agent = lambda x: int(np.random.random()*5)
opponent = lambda x: int(np.random.random()*5)
rewards = []
num_trials = 1000
for i in range(num_trials):
  reward, state = run_episode(agent, opponent)
  rewards.append(reward)
# this should be a small number because we expect each side to win about half the time
print(np.sum(rewards)/num_trials)

0.004
