### Q-Planning

In [3]:
import numpy as np
import matplotlib.pyplot as plt


### Tabular Dyna-Q

In [None]:
UNOBSERVED = (-1,-1,)
# Initialize Q(s, a) and Model(s, a) for all s ∈ S and a ∈ A(s)
def DynaQ(Q, Model, Sequence, n, alpha, gamma):
    """Run tabular Dyna-Q over the transitions produced by ``Sequence``.

    For every real transition the function performs one direct Q-learning
    update, records the transition in ``Model`` and then performs ``n``
    planning updates on (state, action) pairs replayed from ``Model``.

    Args:
        Q: action-value table, indexed as ``Q[state][action]``; updated in
            place.
        Model: container indexed as ``Model[state][action]`` whose slots can
            hold a ``(reward, next_state)`` pair (e.g. nested lists, or a
            NumPy array with ``dtype=object``); updated in place.
        Sequence: iterable yielding
            ``(state, is_terminal, next_state, action, reward)`` tuples.
        n: number of planning updates after each real step.
        alpha: step size.
        gamma: discount factor.

    Returns:
        ``Q``, once ``Sequence`` is exhausted.

    Notes:
        * Only pairs visited during this call are replayed; entries that were
          already stored in ``Model`` are not sampled until revisited.
        * ``is_terminal`` is not used: the update always bootstraps from the
          reported ``next_state``.
    """
    def as_index(s):
        # Array-like states must become tuples, otherwise NumPy would treat
        # them as fancy indices instead of the address of a single cell.
        return tuple(s) if np.ndim(s) else s

    observed = []   # (S, A) pairs in first-seen order, for uniform sampling
    seen = set()    # same pairs, for O(1) duplicate detection

    # Iterating (rather than calling next() directly) lets the sequence
    # initialise itself via __iter__ and ends the loop cleanly on exhaustion.
    for state, is_terminal, next_state, action, reward in Sequence:
        # (a)-(c) S, A, R, S` come from the real experience stream.
        state = as_index(state)
        next_state = as_index(next_state)

        # (d) Q(S, A) <- Q(S, A) + alpha*[R + gamma*max_a Q(S`, a) - Q(S, A)]
        Q[state][action] += alpha * (
            reward + gamma * max(Q[next_state]) - Q[state][action])

        # (e) Model(S, A) <- R, S` (assuming deterministic environment)
        Model[state][action] = (reward, next_state)
        if (state, action) not in seen:
            seen.add((state, action))
            observed.append((state, action))

        # (f) Repeat n times: replay a random previously observed pair.
        for sample in np.random.randint(len(observed), size=n):
            S, A = observed[sample]
            # R, S` <- Model(S, A)
            R, S1 = Model[S][A]
            S1 = as_index(S1)
            # The modelled reward R (not the live one) drives the update.
            Q[S][A] += alpha * (R + gamma * max(Q[S1]) - Q[S][A])

    return Q

In [None]:
# Grid dimensions as (rows, columns).
maze_shape = (6, 9)
# Each obstacle is a half-open rectangle:
# [first_row, first_column, end_row, end_column].
maze_obstacles_1 = [[2, 1, 3, 9]]
maze_obstacles_2 = [[2, 1, 3, 8]]
# Cell markers and the special cells of the maze.
OBSTACLE_TAG = 0
START_TAG = 2
TARGET_TAG = 3
start_cell = (0, 3)
target_cell = (5, 8)
REWARD = -1


def buildMaze(shape, obstacles):
    """Return a uint8 grid of ones with every obstacle rectangle zeroed.

    ``obstacles`` is an iterable of ``(begin_row, begin_column, end_row,
    end_column)`` quadruples; the end coordinates are exclusive.
    """
    grid = np.ones(shape, dtype=np.uint8)
    for top, left, bottom, right in obstacles:
        walled = ((r, c)
                  for r in range(top, bottom)
                  for c in range(left, right))
        for cell in walled:
            grid[cell] = OBSTACLE_TAG
    return grid


maze1, maze2 = (buildMaze(maze_shape, layout)
                for layout in (maze_obstacles_1, maze_obstacles_2))


In [None]:
# Display the second maze grid (cells are 1 by default, OBSTACLE_TAG where blocked).
maze2

### Setup

In [None]:
# Actions legend: Up, Right, Down, Left
# Each action is a [row_delta, column_delta] step on the maze grid.
ACTIONS = [[1,0], [0,1], [-1,0], [0,-1]]
ACTIONS_NUM = len(ACTIONS)
NO_REWARD = 0
REWARD = 1
# One table slot per (row, column, action) triple.
shape = maze_shape + (ACTIONS_NUM,)
Q = np.zeros(shape)
# np.full cannot build this table: the fill value (0, UNOBSERVED) is a ragged
# nested pair, which NumPy can neither convert to a regular array nor
# broadcast to `shape`. Use an object array instead and store one
# (reward, next_state) tuple per slot, with UNOBSERVED marking pairs that
# have not been experienced yet.
Model = np.empty(shape, dtype=object)
for _slot in np.ndindex(shape):
    Model[_slot] = (0, UNOBSERVED)

#def mazeGetStartState():
#    return start_cell

def maze_getTransition(maze, state, action):
    """Apply ``action`` to ``state`` in ``maze`` and report the outcome.

    Args:
        maze: grid array in which values greater than 0 are walkable cells.
        state: current cell as a tuple of grid coordinates.
        action: index into ``ACTIONS``.

    Returns:
        A ``(is_terminal, reward, next_state)`` tuple:
        * moves that leave the grid or hit a non-walkable cell keep the agent
          in ``state`` with ``NO_REWARD``;
        * reaching ``target_cell`` is terminal, yields ``REWARD`` and puts
          the agent back on ``start_cell``;
        * any other move yields ``NO_REWARD`` and the new cell.
    """
    # Add the step component-wise; `list(state) + ACTIONS[action]` would
    # concatenate the two lists instead of summing the coordinates.
    next_state = tuple(pos + step for pos, step in zip(state, ACTIONS[action]))

    # Explicit range check: negative coordinates must be rejected rather
    # than wrapped around by NumPy's negative indexing.
    inside = all(0 <= pos < size for pos, size in zip(next_state, maze.shape))
    if not (inside and maze[next_state] > 0):
        return (False, NO_REWARD, state)
    if next_state == target_cell:
        return (True, REWARD, start_cell)

    return (False, NO_REWARD, next_state)
    

In [None]:
class SequenceGenerator:
    """Iterator producing one environment transition per step.

    Args:
        getAction: callable ``(state, episode_i) -> action``.
        getStartState: callable ``(episode_i) -> state`` used when iteration
            (re)starts.
        getTransition: callable ``(state, action) ->
            (is_terminal, reward, next_state)``.
        episode_imax: number of episodes to generate; a value <= 0 means the
            sequence never stops by itself.

    Each step yields ``(state, is_terminal, next_state, action, reward)``.
    """

    def __init__(self, getAction, getStartState, getTransition, episode_imax=1):
        self.episode_imax = episode_imax
        self.get_action = getAction
        self.get_start_state = getStartState
        self.get_transition = getTransition
        # Episode counter and current state are set lazily by _reset() so
        # that the start-state callable is not invoked at construction time.
        self._started = False

    def _reset(self):
        # Rewind to the first episode and fetch its start state.
        self.episode_i = 1
        self.state = self.get_start_state(self.episode_i)
        self._started = True

    def __iter__(self):
        self._reset()
        return self

    def __next__(self):
        # Allow next() on a fresh instance without a prior iter() call.
        if not self._started:
            self._reset()

        if self.episode_imax > 0 and self.episode_i > self.episode_imax:
            raise StopIteration

        action = self.get_action(self.state, self.episode_i)
        keep_state = self.state
        # Transition functions report (is_terminal, reward, next_state);
        # unpack in that order so reward and state are not swapped.
        is_terminal, reward, self.state = self.get_transition(keep_state, action)
        # A terminal transition closes the current episode.
        self.episode_i += int(is_terminal)
        return keep_state, is_terminal, self.state, action, reward
    
# Positions of the reward and the current state inside the tuple returned by
# SequenceGenerator.__next__: (state, is_terminal, next_state, action, reward).
REWARD_I = 4
STATE_I = 0

class EpsilonGreedyPolicy:
    """Callable epsilon-greedy action selector backed by a value table ``Q``.

    With probability ``Epsilon`` a uniformly random action index is returned;
    otherwise the index of the largest value in ``Q[state]`` is returned.
    """

    def __init__(self, Q, Epsilon=0.1):
        self.Q = Q
        self.epsilon = Epsilon

    def __call__(self, state, episode_i=1):
        # `episode_i` is accepted for interface compatibility and is unused.
        action_values = self.Q[state]
        explore = np.random.rand(1)[0] < self.epsilon
        if not explore:
            return np.argmax(action_values)
        return np.random.randint(0, len(action_values))


In [34]:
# Scratch check: x has a trailing axis of length 2, so x[..., 0] picks the
# first component of every pair. np.nonzero on the comparison returns one
# index array per remaining axis for the positions where that component is 1.
x = np.array([[(1,66),(77,-33)],
              [(-3,2),(77,9)],
              [(0,31),(1,45)]])
#np.transpose(np.nonzero(x[...,0] == 1))
np.nonzero(x[...,0] == 1)
#x[...,1]

(array([0, 2]), array([0, 1]))

In [38]:
# Scratch check: transposing a tuple of three equal-length lists produces one
# row per position, each row holding that position's entry from every list.
np.transpose(([4,6,2,-1],[90,93,556,73],[-23,-9,56,-1]))

array([[  4,  90, -23],
       [  6,  93,  -9],
       [  2, 556,  56],
       [ -1,  73,  -1]])