<a href="https://colab.research.google.com/github/mcnica89/Markov-Chains-RL-W25/blob/main/Copy_of_MonteCarloPolicyEvaluation_ZombieDice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from tqdm import tqdm
np.set_printoptions(precision=2)

# Simulation

In [None]:
# State-space bounds: a state is the pair (brains collected, shotguns collected).
max_brains = 9 # largest brain count tracked; tables below have max_brains+1 rows (the cup itself holds 13 dice)
max_shotguns = 2 # you immediately lose if you exceed this!

def num_to_words(my_array):
    '''Translate an array of outcome codes (0=Brain, 1=Shotgun, 2=Feet) into words.

    Parameters
    ----------
    my_array : array_like of int
        Outcome codes of any shape (empty and 0-d inputs are accepted).

    Returns
    -------
    numpy.ndarray
        Object array with the same shape as ``my_array`` holding the face names.

    Raises
    ------
    IndexError
        If a code lies outside the range of the three-entry lookup table.
    '''
    words = np.array(['Brain', 'Shotgun', 'Feet'], dtype=object)
    codes = np.asarray(my_array).astype(int)

    # One vectorised lookup replaces the element-by-element nditer loop, which
    # raised on zero-sized input and could not index plain Python lists.
    result = np.empty(codes.shape, dtype=object)
    result[...] = words[codes]

    return result

def dice_roll_history(rng=None): #secret!
  '''Returns an array of all the outcomes of the dice using the code 0=Brain, 1=Shotgun, 2=Feet

  Three dice are held at a time. Every round all three are rolled; a dice showing a
  brain or a shotgun is swapped for a fresh draw from the cup, while a dice showing
  feet stays in hand and is rolled again in the next round.

  Parameters:
    rng: optional source of randomness exposing choice(n, p=...), for example
         np.random.default_rng(seed). When omitted the global np.random stream is
         used, so np.random.seed(...) keeps controlling the result.

  Returns:
    int array of shape (10,3); row t holds the outcomes of the three dice in round t.
  '''
  if rng is None:
    rng = np.random #legacy global stream

  full_cup = np.array([3,4,6]) #number of each type of dice in a full cup
  dice_types = full_cup.copy() #number of each type of dice still in the cup
  dice_probs = 1.0/6.0*np.array([[1,3,2],[2,2,2],[3,1,2]]) #the number of sides of each type on the 3 dice types.
  #order is brains,shotguns,feet

  t_max = int(np.sum(full_cup))-3 #number of rounds recorded (dice in the cup once three are in hand)

  history = np.ones((t_max,3),dtype=int)

  def draw_a_new_dice():
    '''returns a new dice from the cup, one of the three types, and removes it from dice_types'''
    p_dice = dice_types/np.sum(dice_types)
    choice = rng.choice( 3,p=p_dice)
    dice_types[choice] -= 1
    if np.sum(dice_types) == 0:
      dice_types[:] = full_cup #refill the cup if needed
    return choice

  def roll_a_dice(dice_type):
    '''roll a dice of type dice_type and return the result order is brains, shotgun, feet'''
    return rng.choice( 3,p=dice_probs[dice_type])

  #initialize by drawing three dice
  current_dice = np.array([draw_a_new_dice() for _ in range(3)])

  #roll dice to create a history
  for t in range(t_max):
    for i in range(3):
      this_roll = int(roll_a_dice(current_dice[i]))
      history[t,i] = this_roll
      if this_roll in (0,1): #for brains or shotguns, draw a new dice
        current_dice[i] = draw_a_new_dice()
  return history

In [None]:
# Draw one sample history and show it twice: first as words, then as the raw codes.
my_history = dice_roll_history()
print(num_to_words(my_history), my_history, sep="\n")

[['Brain' 'Feet' 'Shotgun']
 ['Brain' 'Brain' 'Shotgun']
 ['Shotgun' 'Brain' 'Brain']
 ['Feet' 'Brain' 'Shotgun']
 ['Feet' 'Shotgun' 'Feet']
 ['Brain' 'Shotgun' 'Feet']
 ['Brain' 'Brain' 'Brain']
 ['Brain' 'Shotgun' 'Brain']
 ['Brain' 'Shotgun' 'Feet']
 ['Feet' 'Brain' 'Brain']]
[[0 2 1]
 [0 0 1]
 [1 0 0]
 [2 0 1]
 [2 1 2]
 [0 1 2]
 [0 0 0]
 [0 1 0]
 [0 1 2]
 [2 0 0]]


# Policy Evaluation

In [None]:
#### Policy evaluation! ###
# Monte Carlo evaluation of one fixed policy: play many episodes and, for every state
# an episode passed through, fold that episode's final reward into a running average.

# Setup number of episodes and maximum length of an episode
num_episodes = 10_000
t_max = 10

# Setup the value function
v = np.zeros((max_brains+1,max_shotguns+1))

# Setup the policy! Policy is fixed for now. 1 = roll again, 0 = stop.
brains,shotguns = np.indices((max_brains+1,max_shotguns+1))
policy = 1.0*(brains < 6) #reroll if brains<6 otherwise stop


# Number of visits to each state.
visits = np.zeros((max_brains+1,max_shotguns+1),dtype=int)

# Main Monte Carlo loop

verbose = False #whether or not to output text as we go
for episode in tqdm(range(num_episodes)):
  # Start a new episode!
  if verbose: print(f"Episode #{episode}...")
  this_episode_visits = np.zeros((max_brains+1,max_shotguns+1),dtype=int)
  brain_state = int(0) #starting state of the number of brains, shotguns we've seen.
  shotgun_state = int(0)
  my_history = dice_roll_history() #generates all the rolls we need for the episode

  for t in range(t_max): #go through the rounds t.
    if verbose: print(f".{t=}, Brains:{brain_state}, Shotguns:{shotgun_state}")
    this_episode_visits[brain_state,shotgun_state] += 1

    if policy[brain_state,shotgun_state] == 0:
      if verbose:print(f"..Chose to Stop! Final Brains={brain_state}")
      #chose to stop
      break

    if verbose: print(f"..Chose to Reroll! Roll={num_to_words(my_history[t])}")
    #chose to roll again!
    num_brains = np.sum(my_history[t] == 0)
    num_shotguns = np.sum(my_history[t] == 1)
    #clip so the state always stays inside the tables, whatever policy is plugged in
    #(same convention as the q-learning cells below)
    brain_state = min(max_brains, brain_state+num_brains)
    shotgun_state += num_shotguns

    # if we get too many shotguns, we lose.
    if shotgun_state > max_shotguns:
      if verbose:print("Went bust :(")
      break

  #calculate final rewards here
  if shotgun_state <= max_shotguns:
    reward = brain_state
  else:
    reward = 0

  #update the number of visits to each state and the running average of v.
  #this_episode_visits acts as a mask: only states seen this episode are touched.
  v = np.where(this_episode_visits,v + 1/(visits+1)*(reward - v),v)
  visits = np.where(this_episode_visits,visits+1,visits)

  if verbose:print(f"{v=}")

print(f"Policy: \n{policy}")
print(f"v \n{v}")
print(f"visits \n{visits}")


100%|██████████| 10000/10000 [00:23<00:00, 430.73it/s]

Policy: 
[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
v 
[[1.15 0.49 0.16]
 [1.54 0.81 0.27]
 [2.4  1.28 0.41]
 [3.09 1.99 0.76]
 [4.18 2.8  1.43]
 [5.57 4.52 1.96]
 [6.   6.   6.  ]
 [7.   7.   7.  ]
 [8.   8.   8.  ]
 [0.   0.   0.  ]]
visits 
[[10000  1017   995]
 [ 1273  2580  1754]
 [ 1550  2283  1647]
 [  884  1307  1777]
 [  420  1048  1397]
 [  224   681  1023]
 [  134   361   704]
 [   36   182   278]
 [   10    40    49]
 [    0     0     0]]





# Policy Improvement

In [None]:
#Policy improvement!
# Purely greedy Monte Carlo control: before every episode the policy is re-derived
# from the current q estimates (roll wherever q_roll >= q_stay).

# Setup number of episodes and maximum length of an episode
num_episodes = 1000
t_max = 10

table_shape = (max_brains+1,max_shotguns+1)

# Setup the q functions now
q_stay = np.zeros(table_shape)
q_roll = np.zeros(table_shape)

#note that now the visits depend on the action you took.
visits_stay = np.zeros(table_shape,dtype=int)
visits_roll = np.zeros(table_shape,dtype=int)

#starting policy (replaced by the greedy policy as soon as the first episode begins)
brains,shotguns = np.indices(table_shape)
policy = np.zeros(table_shape,dtype=int) #where to roll again and where to stop.
policy[brains,shotguns] = (brains < 5) #reroll if brains<5 otherwise stop

#Main Monte Carlo Loop
verbose = False
for episode in tqdm(range(num_episodes)):

  #greedy with respect to the current estimates; ties go to rolling
  policy = (q_roll >= q_stay)

  #Initialize Episode
  if verbose: print(f"Episode #{episode}...")
  this_episode_visits_stay = np.zeros(table_shape,dtype=int)
  this_episode_visits_roll = np.zeros(table_shape,dtype=int)
  my_history = dice_roll_history()
  brain_state = int(0) #starting state of the number of brains, shotguns we've seen.
  shotgun_state = int(0)

  #Loop over times t
  for t in range(t_max):
    if verbose: print(f".{t=}, Brains:{brain_state}, Shotguns:{shotgun_state}")

    #get the action from the policy
    action = policy[brain_state, shotgun_state]

    if action == 0:
      #chose to stop: the episode ends in the current state
      this_episode_visits_stay[brain_state,shotgun_state] += 1
      if verbose:print(f"..Chose to Stop! Final Brains={brain_state}")
      break

    #otherwise the action is 1: roll again
    if verbose: print(f"..Chose to Reroll! Roll={num_to_words(my_history[t])}")
    this_episode_visits_roll[brain_state,shotgun_state] += 1

    rolled_brains = np.sum(my_history[t] == 0)
    rolled_shotguns = np.sum(my_history[t] == 1)
    brain_state = min(max_brains, brain_state+rolled_brains)
    shotgun_state += rolled_shotguns

    if shotgun_state > max_shotguns:
      if verbose:print("Went bust :(")
      break

  #a bust is worth nothing, otherwise the reward is the brains collected
  reward = brain_state if shotgun_state <= max_shotguns else 0

  #running-average update of every (state, action) pair used in this episode
  stayed = this_episode_visits_stay > 0
  rolled = this_episode_visits_roll > 0

  q_stay = np.where(stayed,q_stay + 1/(visits_stay+1)*(reward - q_stay),q_stay)
  visits_stay = np.where(stayed,visits_stay+1,visits_stay)

  q_roll = np.where(rolled,q_roll+ 1/(visits_roll+1)*(reward - q_roll),q_roll)
  visits_roll = np.where(rolled,visits_roll+1,visits_roll)

  if verbose:print(f"{q_stay=}")
  if verbose:print(f"{q_roll=}")


print("")
print(f"q_stay:\n {q_stay}")
print(f"visits_stay: \n {visits_stay}")
print(f"q_roll:\n {q_roll}")
print(f"visits_roll:\n {visits_roll}")

print(f"policy: \n {policy}")


100%|██████████| 1000/1000 [00:02<00:00, 490.60it/s]


q_stay:
 [[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
visits_stay: 
 [[0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
q_roll:
 [[0.01 0.   0.  ]
 [0.   0.   0.  ]
 [0.   0.04 0.  ]
 [0.   0.   0.  ]
 [0.   0.08 0.  ]
 [0.   0.   0.  ]
 [0.   0.23 0.  ]
 [0.   0.   0.  ]
 [0.   0.69 0.  ]
 [0.   0.9  0.  ]]
visits_roll:
 [[1000   80   85]
 [ 141  269  178]
 [ 141  237  158]
 [  92  125  161]
 [  42  112  133]
 [  33   70  112]
 [  12   39   79]
 [   3   28   65]
 [   6   13   42]
 [   2   10   34]]
policy: 
 [[ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]]





# Epsilon Greedy Learning

In [None]:
def epsilon_random(policy_action,epsilon):
  '''Epsilon-greedy action choice.

  Draws one uniform number; if it falls below epsilon the result is a uniformly
  random action in {0,1}, otherwise policy_action is handed back unchanged.
  '''
  explore = np.random.random() < epsilon
  #the second random draw only happens when we actually explore
  return np.random.randint(2) if explore else policy_action

In [None]:
#Policy improvement!
# Epsilon-greedy Monte Carlo control: the greedy policy is re-derived from the q
# estimates before every episode, and each action is randomised with probability epsilon.

# Setup number of episodes and maximum length of an episode
num_episodes = 10_000
t_max = 10

# Exploration schedule: explore heavily at first, then mostly exploit.
explore_episodes = 5000 #number of initial episodes that use the high exploration rate
epsilon_explore = 0.5
epsilon_exploit = 0.05

# Setup the q functions now
q_stay = np.zeros((max_brains+1,max_shotguns+1))
q_roll = np.zeros((max_brains+1,max_shotguns+1))

#note that now the visits depend on the action you took.
visits_stay = np.zeros((max_brains+1,max_shotguns+1),dtype=int)
visits_roll = np.zeros((max_brains+1,max_shotguns+1),dtype=int)

#starting policy (replaced by the greedy policy as soon as the first episode begins)
policy = np.zeros((max_brains+1,max_shotguns+1),dtype=int) #where to roll again and where to stop.
brains,shotguns = np.indices((max_brains+1,max_shotguns+1))
policy[brains,shotguns] = (brains < 5) #reroll if brains<5 otherwise stop

#Main Monte Carlo Loop
verbose = False
for episode in tqdm(range(num_episodes)):

  #greedy with respect to the current estimates; ties go to rolling
  policy = (q_roll >= q_stay)

  #the exploration rate only depends on the episode, so pick it once per episode
  if episode < explore_episodes:
    epsilon = epsilon_explore
  else:
    epsilon = epsilon_exploit

  #Initialize Episode
  if verbose: print(f"Episode #{episode}...")
  this_episode_visits_stay = np.zeros((max_brains+1,max_shotguns+1),dtype=int)
  this_episode_visits_roll = np.zeros((max_brains+1,max_shotguns+1),dtype=int)
  my_history = dice_roll_history()
  brain_state = int(0) #starting state of the number of brains, shotguns we've seen.
  shotgun_state = int(0)

  #Loop over times t
  for t in range(t_max):
    if verbose: print(f".{t=}, Brains:{brain_state}, Shotguns:{shotgun_state}")

    #get the action from the policy, randomised with probability epsilon
    action = epsilon_random(policy[brain_state, shotgun_state],epsilon)


    if action == 1:
      if verbose: print(f"..Chose to Reroll! Roll={num_to_words(my_history[t])}")
      this_episode_visits_roll[brain_state,shotgun_state] += 1

      #chose to roll again!
      num_brains = np.sum(my_history[t] == 0)
      num_shotguns = np.sum(my_history[t] == 1)
      brain_state = min(max_brains, brain_state+num_brains)
      shotgun_state += num_shotguns

      if shotgun_state > max_shotguns:
        if verbose:print("Went bust :(")
        break

    elif action == 0:
      this_episode_visits_stay[brain_state,shotgun_state] += 1
      if verbose:print(f"..Chose to Stop! Final Brains={brain_state}")
      #chose to stop
      break

  #a bust is worth nothing, otherwise the reward is the brains collected
  if shotgun_state <= max_shotguns:
    reward = brain_state
  else:
    reward = 0

  #running-average update of every (state, action) pair used in this episode
  q_stay = np.where(this_episode_visits_stay,q_stay + 1/(visits_stay+1)*(reward - q_stay),q_stay)
  visits_stay = np.where(this_episode_visits_stay,visits_stay+1,visits_stay)

  q_roll = np.where(this_episode_visits_roll,q_roll+ 1/(visits_roll+1)*(reward - q_roll),q_roll)
  visits_roll = np.where(this_episode_visits_roll,visits_roll+1,visits_roll)

  if verbose:print(f"{q_stay=}")
  if verbose:print(f"{q_roll=}")


print("")
print(f"q_stay:\n {q_stay}")
print(f"visits_stay: \n {visits_stay}")
print(f"q_roll:\n {q_roll}")
print(f"visits_roll:\n {visits_roll}")

print(f"policy: \n {1.0*policy}")


100%|██████████| 10000/10000 [00:17<00:00, 573.72it/s]


q_stay:
 [[0. 0. 0.]
 [1. 1. 1.]
 [2. 2. 2.]
 [3. 3. 3.]
 [4. 4. 4.]
 [5. 5. 5.]
 [6. 6. 6.]
 [7. 7. 7.]
 [8. 8. 8.]
 [9. 9. 9.]]
visits_stay: 
 [[1405  101   97]
 [ 134  286 1266]
 [ 167  295  931]
 [  93  200  853]
 [  29  668  460]
 [  18  310  189]
 [   7  109   46]
 [   6   39   27]
 [  21   17    6]
 [   4    3    4]]
q_roll:
 [[1.97 1.28 0.54]
 [2.65 1.89 0.87]
 [3.39 2.47 1.39]
 [4.34 3.22 1.81]
 [5.29 3.34 1.3 ]
 [6.   3.77 2.19]
 [7.06 4.25 1.88]
 [7.14 4.5  2.33]
 [0.   5.67 4.  ]
 [6.75 4.5  0.  ]]
visits_roll:
 [[8644  805  736]
 [ 979 1905  163]
 [1237 1560  111]
 [ 620  798  108]
 [ 238   71   46]
 [ 146   35   21]
 [  81    4    8]
 [  29    4    3]
 [   0    3    2]
 [   4    2    3]]
policy: 
 [[1. 1. 1.]
 [1. 1. 0.]
 [1. 1. 0.]
 [1. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]





# Off Policy Learning - Policy Improvement

In [None]:
# A simple off policy policy improvement:
# Never Stop Rolling, *but* imagine what WOULD have happened if you chose to stop or roll again according to your
# current greedy policy. Whenever that policy says "stop", the stop reward is credited to
# the pending (state, action) pairs, the bookkeeping is reset, and the dice are rolled anyway.

def running_average_update(q, visits, visited_mask, reward):
  '''Fold reward into the running average q wherever visited_mask is nonzero.

  Returns the updated (q, visits) pair; entries outside the mask are left unchanged.
  '''
  q = np.where(visited_mask,q + 1/(visits+1)*(reward - q),q)
  visits = np.where(visited_mask,visits+1,visits)
  return q, visits

# Setup number of episodes and maximum length of an episode
num_episodes = 1000
t_max = 10

# Setup the q functions now
q_stay = np.zeros((max_brains+1,max_shotguns+1))
q_roll = np.zeros((max_brains+1,max_shotguns+1))

#note that now the visits depend on the action you took.
visits_stay = np.zeros((max_brains+1,max_shotguns+1),dtype=int)
visits_roll = np.zeros((max_brains+1,max_shotguns+1),dtype=int)

#starting policy (replaced by the greedy policy as soon as the first episode begins)
policy = np.zeros((max_brains+1,max_shotguns+1),dtype=int) #where to roll again and where to stop.
brains,shotguns = np.indices((max_brains+1,max_shotguns+1))
policy[brains,shotguns] = (brains < 5) #reroll if brains<5 otherwise stop

#Main Monte Carlo Loop
verbose = False
for episode in tqdm(range(num_episodes)):

  #strict inequality: with all-zero estimates the policy starts out as "stop everywhere",
  #and q_stay is only ever updated in states where the policy says stop
  policy = (q_roll > q_stay)

  #Initialize Episode
  if verbose: print(f"Episode #{episode}...")
  this_episode_visits_stay = np.zeros((max_brains+1,max_shotguns+1),dtype=int)
  this_episode_visits_roll = np.zeros((max_brains+1,max_shotguns+1),dtype=int)
  my_history = dice_roll_history()
  brain_state = int(0) #starting state of the number of brains, shotguns we've seen.
  shotgun_state = int(0)

  #Loop over times t
  for t in range(t_max):
    if verbose: print(f".{t=}, Brains:{brain_state}, Shotguns:{shotgun_state}")

    #get the action from the policy
    action = policy[brain_state, shotgun_state]

    if action == 0:
      if verbose:print(f"..Chose to Stop! Final Brains={brain_state}")
      this_episode_visits_stay[brain_state,shotgun_state] += 1

      #the policy would stop here: settle everything pending with the stop reward ...
      reward = brain_state
      q_stay, visits_stay = running_average_update(q_stay, visits_stay, this_episode_visits_stay, reward)
      q_roll, visits_roll = running_average_update(q_roll, visits_roll, this_episode_visits_roll, reward)

      #... and start a fresh record for the imagined continuation
      this_episode_visits_stay = np.zeros((max_brains+1,max_shotguns+1),dtype=int)
      this_episode_visits_roll = np.zeros((max_brains+1,max_shotguns+1),dtype=int)

      if verbose: print(f"..Simulating rolling again anyways! Roll={num_to_words(my_history[t])}")
    else:
      if verbose: print(f"..Chose to Reroll! Roll={num_to_words(my_history[t])}")

    #either way the dice are rolled from the current state
    this_episode_visits_roll[brain_state,shotgun_state] += 1

    num_brains = np.sum(my_history[t] == 0)
    num_shotguns = np.sum(my_history[t] == 1)
    brain_state = min(max_brains, brain_state+num_brains)
    shotgun_state += num_shotguns

    if shotgun_state > max_shotguns:
      if verbose:print("Went bust :(")
      break

  #settle whatever is still pending once the rolls end (bust or out of rounds)
  if shotgun_state <= max_shotguns:
    reward = brain_state
  else:
    reward = 0

  q_stay, visits_stay = running_average_update(q_stay, visits_stay, this_episode_visits_stay, reward)
  q_roll, visits_roll = running_average_update(q_roll, visits_roll, this_episode_visits_roll, reward)

  if verbose:print(f"{q_stay=}")
  if verbose:print(f"{q_roll=}")


print("")
print(f"q_stay:\n {q_stay}")
print(f"visits_stay: \n {visits_stay}")
print(f"q_roll:\n {q_roll}")
print(f"visits_roll:\n {visits_roll}")

print(f"policy: \n {1.0*policy}")


100%|██████████| 1000/1000 [00:01<00:00, 535.21it/s]


q_stay:
 [[0. 0. 0.]
 [1. 1. 1.]
 [2. 2. 2.]
 [3. 3. 3.]
 [4. 4. 4.]
 [5. 5. 5.]
 [6. 6. 6.]
 [7. 7. 7.]
 [8. 8. 8.]
 [9. 9. 9.]]
visits_stay: 
 [[  2   1   2]
 [  1   3 172]
 [  1   6 169]
 [  1  45 173]
 [  1   7 156]
 [  1  76 106]
 [  3  35  88]
 [  2  20  54]
 [  1  11  47]
 [  9  23  49]]
q_roll:
 [[2.24 1.46 0.69]
 [3.09 2.14 0.89]
 [3.72 2.76 1.18]
 [4.52 3.44 1.49]
 [5.25 4.65 1.73]
 [5.57 4.62 2.19]
 [6.46 5.07 2.93]
 [7.86 6.55 2.55]
 [8.57 6.27 2.47]
 [8.   7.04 1.84]]
visits_roll:
 [[1000  111   98]
 [ 116  267  188]
 [ 163  214  169]
 [  96  132  173]
 [  32  110  156]
 [  30   77  106]
 [  13   40   88]
 [   7   29   55]
 [   7   11   47]
 [   9   23   49]]
policy: 
 [[1. 1. 1.]
 [1. 1. 0.]
 [1. 1. 0.]
 [1. 1. 0.]
 [1. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 0.]]





In [None]:
# Show the raw (boolean) array currently held in `policy`.
print(policy)

[[ True  True  True]
 [ True  True False]
 [ True  True False]
 [ True  True False]
 [ True  True False]
 [ True False False]
 [ True False False]
 [ True False False]
 [ True False False]
 [False False False]]


1. No policy change
2. Greedy policy improvement -> Gets stuck!
3. "Optimism in the face of uncertainty"
4. Alpha improvements = Slowly delete history
5. Epsilon Greedy Improvements

In [None]:
#Policy improvement!
# Epsilon-greedy Monte Carlo control with a decaying exploration rate, a policy that is
# only refreshed every few episodes, and a step size that never drops below alpha
# (so old episodes are slowly forgotten).

# Setup number of episodes and maximum length of an episode
num_episodes = 10_000
t_max = 10

# Learning hyper-parameters
alpha = 0.05 #lower bound on the step size of the running average
epsilon_scale = 5.0 #exploration rate is epsilon_scale/sqrt(episode+1)
policy_refresh_every = 10 #re-derive the greedy policy once per this many episodes

# Setup the q functions now
q_stay = np.zeros((max_brains+1,max_shotguns+1))
q_roll = np.zeros((max_brains+1,max_shotguns+1))

#note that now the visits depend on the action you took.
visits_stay = np.zeros((max_brains+1,max_shotguns+1),dtype=int)
visits_roll = np.zeros((max_brains+1,max_shotguns+1),dtype=int)

#starting policy, used until the first greedy refresh
policy = np.zeros((max_brains+1,max_shotguns+1),dtype=int) #where to roll again and where to stop.
brains,shotguns = np.indices((max_brains+1,max_shotguns+1))
policy[brains,shotguns] = (brains < 5) #reroll if brains<5 otherwise stop

#Main Monte Carlo Loop
verbose = False
for episode in tqdm(range(num_episodes)):
  if (episode+1) % policy_refresh_every == 0:
    policy = 1*(q_roll >= q_stay)

  #the exploration rate only depends on the episode, so compute it once per episode
  #(it is >= 1 for the first episodes, which are therefore fully random)
  epsilon = epsilon_scale/np.sqrt(episode+1)

  #Initialize Episode
  if verbose: print(f"Episode #{episode}...")
  this_episode_visits_stay = np.zeros((max_brains+1,max_shotguns+1),dtype=int)
  this_episode_visits_roll = np.zeros((max_brains+1,max_shotguns+1),dtype=int)
  my_history = dice_roll_history()
  brain_state = int(0) #starting state of the number of brains, shotguns we've seen.
  shotgun_state = int(0)

  #Loop over times t
  for t in range(t_max):
    if verbose: print(f".{t=}, Brains:{brain_state}, Shotguns:{shotgun_state}")

    #get the action from the policy, with a random choice sometimes!
    action = epsilon_random(policy[brain_state, shotgun_state],epsilon)

    if action == 1:
      if verbose: print(f"..Chose to Reroll! Roll={num_to_words(my_history[t])}")
      this_episode_visits_roll[brain_state,shotgun_state] += 1

      #chose to roll again!
      num_brains = np.sum(my_history[t] == 0)
      num_shotguns = np.sum(my_history[t] == 1)
      brain_state = min(max_brains, brain_state+num_brains)
      shotgun_state += num_shotguns

      if shotgun_state > max_shotguns:
        if verbose:print("Went bust :(")
        break

    elif action == 0:
      this_episode_visits_stay[brain_state,shotgun_state] += 1
      if verbose:print(f"..Chose to Stop! Final Brains={brain_state}")
      #chose to stop
      break

  #a bust is worth nothing, otherwise the reward is the brains collected
  if shotgun_state <= max_shotguns:
    reward = brain_state
  else:
    reward = 0

  #step size is 1/(visits+1) at first and is held at alpha once that would fall below it
  q_stay = np.where(this_episode_visits_stay,q_stay + np.maximum(1/(visits_stay+1),alpha)*(reward - q_stay),q_stay)
  visits_stay = np.where(this_episode_visits_stay,visits_stay+1,visits_stay)

  q_roll = np.where(this_episode_visits_roll,q_roll+ np.maximum(1/(visits_roll+1),alpha)*(reward - q_roll),q_roll)
  visits_roll = np.where(this_episode_visits_roll,visits_roll+1,visits_roll)

  if verbose:print(f"{q_stay=}")
  if verbose:print(f"{q_roll=}")

print("")
print(f"q_stay:\n {q_stay}")
print(f"visits_stay: \n {visits_stay}")
print(f"q_roll:\n {q_roll}")
print(f"visits_roll:\n {visits_roll}")

print(f"policy:\n {policy}")


100%|██████████| 10000/10000 [00:17<00:00, 565.37it/s]


q_stay:
 [[0. 0. 0.]
 [1. 1. 1.]
 [2. 2. 2.]
 [3. 3. 3.]
 [4. 4. 4.]
 [5. 5. 5.]
 [6. 6. 6.]
 [0. 7. 0.]
 [0. 0. 0.]
 [0. 0. 9.]]
visits_stay: 
 [[ 494   70   37]
 [  61  109 1509]
 [ 132  732 1158]
 [  51  850  894]
 [  50  732  375]
 [ 167  263   80]
 [  67   44    7]
 [   0   13    0]
 [   0    0    0]
 [   0    0    8]]
q_roll:
 [[2.66 1.3  0.62]
 [2.53 2.   0.77]
 [3.47 1.98 1.83]
 [4.41 2.99 1.14]
 [5.02 3.67 0.8 ]
 [4.89 4.54 0.  ]
 [5.43 2.5  0.  ]
 [4.75 0.   3.6 ]
 [9.   1.8  0.  ]
 [6.75 0.   0.  ]]
visits_roll:
 [[9524  885  917]
 [1220 2257   98]
 [1444 1379   45]
 [ 818  270   42]
 [ 286   31   15]
 [  18   13    4]
 [   7   12    1]
 [  12    2   10]
 [   2    5    8]
 [   4    4    1]]
policy:
 [[1 1 1]
 [1 1 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [0 0 0]
 [0 0 0]
 [1 0 1]
 [1 1 1]
 [1 1 0]]



