# Dyna Maze
- Actions: {up, down, right, left} (Deterministic)
- States: Maze Cells
- Rewards: [0, 1]
- Environment: Deterministic

In [22]:
import random 
import math
import matplotlib.pyplot as plt
import pprint

In [23]:
# Episodes 
episodes = 50

# Start
start = (2, 0)

# Goal
goal = (0, 8)

# alpha 
alpha = 0.1

# n-planning
n = [0, 5, 50]

# gamma 
gamma = 0.95

In [24]:
def create():
    """
    Create and return Q and arbitrary policy
    """
    
    # Create length and width to iterate grid
    length = 9; width = 6

    # Set of equiprobable actions N, S, E, W or Up, Down, Right, Left
    actions = [(-1, 0), (1, 0), (0, 1), (0, -1)]
    
    # Policy
    policy = {}; Q = {}

    # Create states for policy
    for i in range(length*width):

        # Create row index
        row = i // length

        # Create column index
        column = i % length

        # Create policy
        policy[(row, column)] = actions
        
        # Create Q(s,a)
        Q[(row, column)] = [0, 0, 0, 0]
        
    # Q and policy 
    return Q, policy

In [25]:
def boundary(state):
    """
    Returns if agent is in bounds
    """
    
    # row boundary
    row_boundary = (state[0] < 0 or state[0] > 5) 
        
    # column boundary
    column_boundary = (state[1] < 0 or state[1] > 8) 
    
    # Obstacle
    obstacle = state in [(1, 2), (2, 2), (3, 2), (4, 5), (0, 7), (1, 7), (2, 7)]
    
    # if not in bounds
    return row_boundary or column_boundary or obstacle

In [26]:
def argmax(Q, s):
    """
    Return the argmax Q(S',a)
    """
    
    # Keep track of ties
    ties = []
    
    # loop through Q(S', a)
    for a in Q[s]:
        
        # Check if tie 
        if a == max(Q[s]):
            
            # tie found
            ties.append(a)
            
    return random.choice(ties)

In [27]:
def e_greedy(Q, policy, state, goal):
    """
    Return an action based on e-greedy policy
    """

    # e-greedy action
    action = argmax(Q, state)
    
    # Exploration
    if random.random() < 0.1:
        
        # Explorative action
        action = random.choice(policy[state])
    
    # e-greedy action
    return action

In [28]:
def move(state, action, goal):
    """
    Return environment obervation
    """
    
    # S' after a
    s_prime = (state[0] + action[0], state[1] + action[1])

    # Check goal reached
    if s_prime == goal:
        
        # S', R = 1 on termination
        return s_prime, 1
    
    # check if out of bounds
    if boundary(s_prime):
        
        # Remain in state
        s_prime = state
    
    # S', R'
    return s_prime, 0

In [29]:
def environment(model, Q, policy, alpha, gamma, start, goal):
    """
    Observation of 'real' experience and step counts returned by environment
    """
    
    # Initialize steps and counts
    N = {state: 0 for state in policy.keys()}; visits = 0
    
    # S current (nonterminal) state
    S = start

    # One step planning for model
    while S != goal:
        
        # Increment counts
        N[S] += 1; visits += 1
        
        # A "-greedy(S, Q)
        A = e_greedy(Q, policy, S, goal)

        # Take action A; observe resultant reward, R, and state, S'
        s_prime, R = move(S, A, goal)

        # Q(S, A) <-- Q(S, A) + a[R + gamma*max_a Q(S',a) - Q(S, A)]
        Q[S][policy[S].index(A)] = Q[S][policy[S].index(A)] + alpha*((R + gamma*max(Q[s_prime])) - Q[S][policy[S].index(A)]) 

        # Model(S, A) <-- R, S' (assuming deterministic environment)
        model[(S, A)] = (R, s_prime)

        # S <-- s
        S = s_prime
    
    # return Model(s, a)
    return Q, model, N, visits

In [30]:
def dyna(alpha, n, episodes, start, goal):
    """
    Return Tabular Dyna-Q
    """
    
    # Initialize Q(s, a) and Model(s, a) for all s of S and a of A(s)
    Q, policy = create(); model = {}
    
    # Loop forever:
    for _ in range(episodes):
        
        # Environment
        Q, model, N, visits = environment(model, Q, policy, alpha, gamma, start, goal)
        
        # Loop repeat n times:
        for i in range(n):
            
            # S <-- random  previously observed state and a random action A previously taken in S
            S, A = random.choice(list(model.keys()))
            
            # R, S' <-- Model(S, A)
            R, s_prime = model[(S,A)]
            
            # Q(S, A) <-- Q(S, A) + a[R + gamma*max_a Q(S',a) - Q(S, A)]
            Q[S][policy[S].index(A)] = Q[S][policy[S].index(A)] + alpha*((R + gamma*max(Q[s_prime])) - Q[S][policy[S].index(A)]) 
    
    return Q

In [31]:
# create terminal printer instance
pp = pprint.PrettyPrinter(width=160, compact=True)

# Run Dyna-Q
for i in range(len(n)):
    
    # Show agents
    print(f"Agent {i+1}")
    pp.pprint(dyna(alpha, n[i], episodes, start, goal))
    print()

Agent 1


TypeError: 'int' object is not subscriptable