# simple value iteration example
tiger problem  
3/20/2018


In [2]:
import numpy as np

In [3]:
def value_iteration(R,T,gamma,nStates,nActions,max_error):
    V = np.zeros([nStates,1])
    Q = np.zeros([nStates,nActions])
    t = 0
    error = V + max_error + 1 # arbitrary to start while loop
    while not all(i < max_error for i in error):
        t = t+1
#         print("TIMESTEP",t)
        V = np.append(V,np.zeros([nStates,1]),axis=1)
        for s in range(nStates): 
            for a in range(nActions):
                Q[s,a] = R[s,a] + gamma*np.dot(T[a,s,:],V[:,t-1])
#             print("Q:",Q)
            V[s,t] = np.amax(Q[s,:]) 
        error = np.abs(V[:,t] - V[:,t-1])
#         print('V_t:',V[:,t])
#         print('error:',error)
    
    # Greedily select policy from V
    policy = np.argmax(Q,axis=1)
    
    return V, policy


In [6]:
# Let's do a cute example: imagine there are 4 states, 2 actions.
# You're starting a business and have to decide whether to save or advertise at each step.
# You can be in one of 4 states: poor and unknown, poor and famous, rich and unknown, and rich and famous. 
# example courtesy of: http://www.cs.cmu.edu/~guestrin/Class/10701-S07/Slides/mdps.pdf

# actions = save, advertise
nActions = 2
actions = ["save", "advertise"]
# states = pu, pf, ru, rf
nStates = 4
states = ["poor and unknown","poor and famous","rich and unknown","rich and famous"]

# gamma: discount factor
gamma = 0.9
# R: rewards in each of four states same for both actions
R = np.array([[0,0],[0,0],[10,10],[10,10]]) 
# T: 4x4x2 matrix of probabilities of transitions between states for the two actions
T = np.array([[[1,0,0,0],[0.5,0,0,0.5],[0.5,0,0.5,0],[0,0,0.5,0.5]],
              [[0.5,0.5,0,0],[0,1,0,0],[0.5,0.5,0,0],[0,1,0,0]]])

In [5]:
def printPolicy(policy,states,actions):
    for i in range(len(policy)):
        print("In state " + states[i] + ", take policy " + actions[policy[i]])


In [10]:
# Define max_error (epsilon) and perform value iteration
max_error = 0.01
V,policy = value_iteration(R,T,gamma,nStates,nActions,max_error)
print("Policy:",policy)

printPolicy(policy,states,actions)

Policy: [1 0 0 0]
In state poor and unknown, take policy advertise
In state poor and famous, take policy save
In state rich and unknown, take policy save
In state rich and famous, take policy save


In [4]:
# actions = save, advertise
nActions = 2
actions = ["take notes", "close eyes"]
# states = pu, pf, ru, rf
nStates = 3
states = ["alert","sleeping","enlightenment"]

# gamma: discount factor
gamma = 0.9
# R: rewards in each of four states same for both actions
R = np.array([[1,1],[5,5],[100,100]]) 
# T: 4x4x2 matrix of probabilities of transitions between states for the two actions
T = np.array([[[0.7,0.1,0.2],[0.8,0.2,0],[0,0,1]],
              [[0.2,0.8,0],[0,1,0],[0,0,1]]])

In [7]:
# Define max_error (epsilon) and perform value iteration
max_error = 0.01
V,policy = value_iteration(R,T,gamma,nStates,nActions,max_error)
print("Policy:",policy)

printPolicy(policy,states,actions)
V

Policy: [0 0 0]
In state alert, take policy take notes
In state sleeping, take policy take notes
In state enlightenment, take policy take notes


array([[  0.        ,   1.        ,  20.08      ,  48.7054    ,
         82.369486  , 118.74380122, 156.01876197, 192.94674446,
        228.69959758, 262.75218402, 294.7959182 , 324.67524222,
        352.34113874, 377.81725235, 401.17532713, 422.51751538,
        441.96374354, 459.6427903 , 475.68608056, 490.22345816,
        503.38039286, 515.27621989, 526.02311658, 535.72559915,
        544.48038096, 552.37647665, 559.49546855, 565.91187484,
        571.69357672, 576.90227411, 581.59394888, 585.81932128,
        589.62429021, 593.05035135, 596.13498986, 598.9120459 ,
        601.41205275, 603.66254776, 605.68835734, 607.5118571 ,
        609.15320882, 610.63057574, 611.96031797, 613.15716939,
        614.23439779, 615.2039496 , 616.07658069, 616.86197433,
        617.56884772, 618.20504799, 618.77763884, 619.2929785 ,
        619.75679007, 620.17422486, 620.54991943, 620.88804698,
        621.19236358, 621.46624986, 621.71274852, 621.93459806,
        622.1342632 , 622.31396225, 622.

In [8]:
# Let's do observable tiger problem

# actions = listen, open-left, open-right
nActions = 3
actions = ["listen", "open-left", "open-right"]
# states = pu, pf, ru, rf
nStates = 2
states = ["tiger-left","tiger-right"]

# gamma: discount factor
gamma = 0.9
# R: rewards 
R = np.array([[0,-100,10],[0,10,-100]]) 
# T: 2x2x3 matrix of probabilities of transitions between states for the three actions
T = np.array([[[1,0],[0,1]],
              [[0.5,0.5],[0.5,0.5]],
              [[0.5,0.5],[0.5,0.5]]])

# Define max_error (epsilon) and perform value iteration
max_error = 0.01
V,policy = value_iteration(R,T,gamma,nStates,nActions,max_error)
print("Policy:",policy)

printPolicy(policy,states,actions)
V

Policy: [2 1]
In state tiger-left, take policy open-right
In state tiger-right, take policy open-left


array([[ 0.        , 10.        , 19.        , 27.1       , 34.39      ,
        40.951     , 46.8559    , 52.17031   , 56.953279  , 61.2579511 ,
        65.13215599, 68.61894039, 71.75704635, 74.58134172, 77.12320755,
        79.41088679, 81.46979811, 83.3228183 , 84.99053647, 86.49148282,
        87.84233454, 89.05810109, 90.15229098, 91.13706188, 92.02335569,
        92.82102012, 93.53891811, 94.1850263 , 94.76652367, 95.2898713 ,
        95.76088417, 96.18479576, 96.56631618, 96.90968456, 97.21871611,
        97.4968445 , 97.74716005, 97.97244404, 98.17519964, 98.35767967,
        98.52191171, 98.66972054, 98.80274848, 98.92247363, 99.03022627,
        99.12720364, 99.21448328, 99.29303495, 99.36373146, 99.42735831,
        99.48462248, 99.53616023, 99.58254421, 99.62428979, 99.66186081,
        99.69567473, 99.72610726, 99.75349653, 99.77814688, 99.80033219,
        99.82029897, 99.83826907, 99.85444217, 99.86899795, 99.88209815,
        99.89388834, 99.9044995 , 99.91404955],
   