# Random-sample one-step tabular Q-planning
### Loop forever:
   1. Select a state, S $\in$ S, and an action, A $\in$ A(S), at random
   2. Send S, A to a sample model, and obtain a sample next reward, R, and a sample next state, S0
   3. Apply one-step tabular Q-learning to S, A, R, S':
####     Q(S, A) $\gets$ Q(S, A) + $\alpha$[R + $\gamma$$max_{a}$ Q(S', a)  Q(S, A)]

In [106]:
import pprint
import random

# Create terminal printer instance
pp = pprint.PrettyPrinter(width=160, compact=True)

# Q - plan with respect to Q
Q = {(i//6+1, i%6+1): [0]*2 for i in range(36)}

## Chō-han Bakuchi
- Two standard six-sided dice are shaken in a bamboo cup by the dealer. 
- The cup is then overturned onto the floor.
- Players then place their wagers on whether the sum total of numbers showing on the two dice will be "Chō" (even) or "Han" (odd).
- The dealer then removes the cup, displaying the dice.
- The winners collect their money.

In [107]:
def cho_han_sample(state, action):
    """
    Return S',R from sample model of cho han
    """
    
    # bet was incorrect and player loses
    reward = -1
    
    # Check if sum of dice is even and if decision matches state then reward == 1
    if sum(state) % 2 == 0 and action % 2 == 0: reward = 1
        
    # Check if sum of dice is even and if decision matches state then reward == 1
    elif sum(state) % 2 != 0 and action % 2 != 0: reward = 1
    
    return reward

In [108]:
def one_step_q_planning(alpha, gamma, runs, Q):
    """
    Return random one-step sample Q-planning
    """
    
    # Loop forever:
    for i in range(runs):
        
        # Select a state and an action at random
        state = random.choice(list(Q.keys())); action = random.choice((0, 1))
        
        # Send S, A to a sample model, and obtain a sample next reward, R, and a sample next state, S'
        s_prime = random.choice(list(Q.keys())); reward = cho_han_sample(state, action)

        # Q(S, A) <-- Q(S, A) + a[R + gamma*max_aQ(S', a) - Q(S, A)]
        Q[state][action] = Q[state][action] + (alpha**i)*(reward + gamma*max(Q[s_prime]) - Q[state][action]) 
    
    # Return Q
    return Q

In [109]:
# Show Q
pp.pprint(one_step_q_planning(0.2, 0.9, 10000, Q))

{(1, 1): [6.4903710731685736e-77, -7.205759403792816e-40],
 (1, 2): [-1.8788340662191044e-254, 8.933531975680016e-24],
 (1, 3): [4.398046511233722e-30, -5.120002726290166e-07],
 (1, 4): [-1.4411518807585633e-40, 2.5600000000000013e-06],
 (1, 5): [4.503599627370509e-37, -1.9342813113834156e-59],
 (1, 6): [-1.5845595994515667e-68, 2.854495385411944e-106],
 (2, 1): [-1.0, 2.3384026197294658e-115],
 (2, 2): [8.388627327352842e-17, -6.5536167772160054e-12],
 (2, 3): [-2.684354560000004e-20, 8.834257939534468e-168],
 (2, 4): [6.871947673600013e-26, -3.546382044571043e-32],
 (2, 5): [-7.922816251426476e-68, 1.8268770466636445e-110],
 (2, 6): [3.277009715199996e-11, -1.8446744073709616e-45],
 (3, 1): [5.629512504580121e-35, -2.0481310719973167e-08],
 (3, 2): [-1.5474250491067328e-61, 7.036874417766418e-33],
 (3, 3): [1.180591620725153e-49, -1.7592145512019396e-31],
 (3, 4): [-1.342177280000002e-19, 1.169575454322478e-114],
 (3, 5): [0.008000000000000002, -1.7422457186352182e-96],
 (3, 6): [-1.