At the moment this notebook contains a copy of the code in Sequential_Menu_Search.ipynb. Your task is to edit this code so that the agent can jump to any position in the menu -- that is, it is no longer restricted to moving one item at a time.

Hint: The number of possible actions is now equal to the number of items in the menu.

In [1]:
from random import random, choice
from scipy import zeros
import numpy as np

from pybrain.utilities import Named
from pybrain.rl.environments.environment import Environment

In [2]:
class Menu(Environment, Named):
    """ A menu modelled as a maze whose layout is a table of booleans.

    A true entry of the table is a wall, a false entry is a free menu item.
    In this example the menu is represented as a one-dimensional array, so
    the agent ("perseus") can only traverse it to the right (east, action 0)
    or to the left (west, action 1), one item at a time.
    Both the actions and the observations can be noisy (see `stochAction`
    and `stochObs`).
    """

    # table of booleans (a true entry marks a wall)
    mazeTable = None

    # single goal, as a (row, column) tuple
    goal = None

    # current state, as a (row, column) tuple
    perseus = None

    # list of possible initial states
    initPos = None

    # directions, as (row offset, column offset)
    N = (1, 0)
    S = (-1, 0)
    E = (0, 1)
    W = (0, -1)

    # note that the action space contains only east or west movements;
    # an action is an index into this list
    allActions = [E, W]

    # stochasticity: probability of replacing the chosen action by a random
    # one, and probability of flipping each observation entry
    stochAction = 0.
    stochObs = 0.

    def __init__(self, topology, goal, **args):
        """ Set up the menu.

        :param topology: table of booleans describing walls and free items
        :param goal: (row, column) position of the goal item
        :param args: forwarded unchanged to `setArgs`
        """
        # Informational output only: the indices of all menu positions.
        # This does NOT change `allActions`; the agent still moves east/west.
        print(list(range(np.size(topology))))

        self.setArgs(**args)
        self.mazeTable = topology
        self.goal = goal
        # identity test: `== None` would compare element-wise on array-likes
        if self.initPos is None:
            # by default, start anywhere that is free except on the goal
            self.initPos = self._freePos()
            self.initPos.remove(self.goal)
        self.reset()

    def reset(self):
        """ return to initial position (stochastically): """
        self.bang = False
        self.perseus = choice(self.initPos)

    def _freePos(self):
        """ produce a list of the free positions, as (row, column) tuples. """
        return [(i, j)
                for i, row in enumerate(self.mazeTable)
                for j, p in enumerate(row)
                if not p]

    def _moveInDir(self, pos, dir):
        """ the new state after the movement in one direction. """
        return (pos[0] + dir[0], pos[1] + dir[1])

    def performAction(self, action):
        """ Move perseus according to `action`, an index into `allActions`.

        With probability `stochAction` the requested action is replaced by a
        uniformly chosen one. If the target position is a wall, perseus stays
        where it is and `bang` is set to True; otherwise `bang` is False.
        """
        if self.stochAction > 0 and random() < self.stochAction:
            action = choice(list(range(len(self.allActions))))

        tmp = self._moveInDir(self.perseus, self.allActions[action])

        if self.mazeTable[tmp]:
            # bumped into a wall: do not move
            self.bang = True
        else:
            self.perseus = tmp
            self.bang = False

    def getSensors(self):
        """ Return one entry per action holding the table value of the
        position that action would lead to (non-zero for a wall).
        Each entry is flipped with probability `stochObs`.
        """
        obs = np.zeros(len(self.allActions))
        for i, a in enumerate(self.allActions):
            obs[i] = self.mazeTable[self._moveInDir(self.perseus, a)]
        if self.stochObs > 0:
            for i in range(len(obs)):
                if random() < self.stochObs:
                    obs[i] = not obs[i]
        return obs

    def __str__(self):
        """ Ascii representation of the maze, with the current state:
        '*' is the goal, '@' is perseus, '#' is a wall. Rows are printed
        in reverse order, one per line.
        """
        lines = []
        for r, row in reversed(list(enumerate(self.mazeTable))):
            chars = []
            for c, p in enumerate(row):
                if (r, c) == self.goal:
                    chars.append('*')
                elif (r, c) == self.perseus:
                    chars.append('@')
                elif p:
                    chars.append('#')
                else:
                    chars.append(' ')
            lines.append(''.join(chars) + '\n')
        return ''.join(lines)



In [3]:
from pybrain.rl.environments import Task
from scipy import array

In [4]:
class MDPMenuTask(Task):
    """ This is a MDP task for the Menu environment. The state is fully observable,
        giving the agent the current position of perseus. A reward of +1 is given on
        reaching the goal; every other step is penalised with a reward of -1. """

    def getReward(self):
        """ compute and return the current reward (i.e. corresponding to the last action performed)
        Note that a negative reward is given for visiting irrelevant menu items.
        Reaching the goal also resets the environment to an initial position. """
        if self.env.goal == self.env.perseus:
            self.env.reset()
            return 1.
        return -1.

    def performAction(self, action):
        """ The action vector is stripped and the only element is cast to integer and given
            to the super class.
        """
        Task.performAction(self, int(action[0]))

    def getObservation(self):
        """ The agent receives its position in the menu, to make this a fully observable
            MDP problem. The (row, column) position is flattened to a single index.
        """
        row, col = self.env.perseus
        # The stride of a row is the number of columns (shape[1]). Using the
        # number of rows maps distinct cells of a non-square table to the same
        # index; for single-row or square tables both give the same result.
        n_cols = self.env.mazeTable.shape[1]
        return array([row * n_cols + col])

In [5]:
from scipy import *

In [6]:
from pybrain.rl.learners.valuebased import ActionValueTable
from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners import Q
from pybrain.rl.experiments import Experiment

In [7]:
import pylab
# plotting setup: select the gray colormap and switch on interactive mode
pylab.gray()
pylab.ion()

In [None]:
# create the menu with 7 items; the last entry (1) is a wall
envmatrix = array([[0,0,0,0,0,0,1]])
# one state per menu position, so the table size follows the layout
matrix_size = envmatrix.size

env = Menu(envmatrix,(0,2))

# one column of the value table per action the environment offers
n_actions = len(env.allActions)

# create task
task = MDPMenuTask(env)

# create value table and initialize with ones
table = ActionValueTable(matrix_size, n_actions)
table.initialize(1.)

# create agent with controller and learner - use SARSA(), Q() or QLambda() here
learner = Q()

# standard exploration is e-greedy, but a different type can be chosen as well, such as:
# learner.explorer = BoltzmannExplorer()

# create agent
agent = LearningAgent(table, learner)

# create experiment
experiment = Experiment(task, agent)

# prepare plotting
pylab.gray()
pylab.ion()

for i in range(100):
    # interact with the environment (here in batch mode)
    experiment.doInteractions(matrix_size)
    agent.learn()
    agent.reset()
    # and draw the table: for every state, the best value over all actions
    best_values = table.params.reshape(matrix_size, n_actions).max(1)
    pylab.pcolor(best_values.reshape(matrix_size, 1))
    pylab.draw()
    pylab.show()
# parenthesised form prints the same text under Python 2 and Python 3
print("training complete")
