In [2]:
from IPython.core.display import *
from StringIO import StringIO
from operator import *
import copy
import itertools
import random

In [3]:
def read_world( filename):
    """Load a grid world from a text file.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped. Each remaining line becomes one row, split
    into a list of single-character cells.

    Returns the rows as a list of lists, in file order.
    """
    with open( filename, 'r') as source:
        stripped_rows = [raw.strip() for raw in source]
    return [list(row) for row in stripped_rows if row != ""]

Next we create a dict of movement costs. Note that we've negated them this time because RL requires negative costs and positive rewards:

In [4]:
# Terrain symbol -> reward for moving onto that terrain (negative, i.e. a cost).
# Pass this dict into functions as an argument; do not read it as a global.
costs = {
    '.': -1,
    '*': -3,
    '^': -5,
    '~': -7,
}
costs

{'*': -3, '.': -1, '^': -5, '~': -7}

We also define NEIGHBORS, a list of (dx, dy) movement offsets. You'll need to work this into your actions parameter, A.

In [5]:
# (dx, dy) offsets for the four moves, in the order up, right, down, left.
# y grows downward, so (0, -1) is "up" on the printed grid.
NEIGHBORS = [
    (0, -1),
    (1, 0),
    (0, 1),
    (-1, 0),
]

### Function argmax finds the action that has the maximum expected value for a particular state.

In [6]:
def argmax(actions, func):
    """Return the element of ``actions`` for which ``func`` is largest.

    Parameters
    ----------
    actions : iterable
        Candidate actions. Any iterable is accepted, not only indexable
        sequences.
    func : callable
        Scoring function, called exactly once per action.

    Returns
    -------
    The first action (in iteration order) that attains the maximum score;
    ties are resolved in favour of the earliest action.

    Raises
    ------
    IndexError
        If ``actions`` is empty.
    """
    candidates = iter(actions)
    try:
        max_action = next(candidates)
    except StopIteration:
        raise IndexError("argmax() requires at least one action")
    max_value = func(max_action)
    for a in candidates:
        value = func(a)
        # Strict comparison keeps the earliest action on ties.
        if value > max_value:
            max_action, max_value = a, value
    return max_action

### Function get_best_policy builds the best policy: for each state it selects the action with the highest Q-value.

In [7]:
def get_best_policy(states, actions, q):
    """Build the greedy policy implied by a Q-table.

    For every state in ``states`` the action with the highest utility
    (as reported by ``get_utility``) is selected via ``argmax``.

    Returns a dict mapping each state to its selected action.
    """
    return {
        state: argmax(actions, lambda action: get_utility(state, action, q))
        for state in states
    }

In [8]:
def get_utility(s, a, q):
    """Look up the value stored in ``q`` for taking action ``a`` in state ``s``.

    ``q`` is indexed by ``(state, action)`` tuples; a pair that is not
    present raises ``KeyError``.
    """
    state_action = (s, a)
    return q[state_action]

### The q_learning function uses Q-Learning (reinforcement learning) to train the agent. After learning the world, which is a grid map, it returns a policy: for each state, whether the agent should move left, right, up, or down.

In [9]:
def q_learning(world, costs, goal, reward, actions, gamma, alpha, max_episode,
               exploit_prob=0.1):
    """Learn a movement policy for a grid world with tabular Q-learning.

    Parameters
    ----------
    world : list of list of str
        Grid addressed as ``world[y][x]``; each cell holds a terrain symbol.
    costs : dict
        Maps a terrain symbol to the reward received for moving onto it.
        Cells whose symbol is not a key of ``costs`` are impassable.
    goal : tuple
        ``(x, y)`` of the terminal cell. It must be a traversable cell.
    reward : number
        Reward received for moving onto ``goal`` (replaces its terrain cost).
    actions : sequence of tuple
        ``(dx, dy)`` offsets the agent may apply to its position.
    gamma : float
        Discount factor applied to the best value of the successor state.
    alpha : float
        Learning rate of the Q-value update.
    max_episode : int
        Number of training episodes; each runs until the goal is reached.
    exploit_prob : float, optional
        Probability of taking the currently best action instead of a
        uniformly random one. Defaults to 0.1.

    Returns
    -------
    dict
        Maps every traversable ``(x, y)`` to the action with the highest
        learned Q-value (built by ``get_best_policy``).

    Raises
    ------
    ValueError
        If ``goal`` is not a traversable cell of ``world``; training could
        never terminate in that case.
    """
    # Traversable cells are the states; moving onto one yields its cost.
    states = set()
    rewards = {}
    for y, row in enumerate(world):
        for x, terrain in enumerate(row):
            if terrain in costs:
                states.add((x, y))
                rewards[(x, y)] = costs[terrain]
    if goal not in states:
        raise ValueError("goal %r is not a traversable cell of the world" % (goal,))
    rewards[goal] = reward

    q = {}
    for s in states:
        for a in actions:
            q[(s, a)] = 0

    def get_state(state, action):
        # Moves off the grid or into an impassable cell leave the agent in place.
        moved = tuple(map(add, state, action))
        return moved if moved in states else state

    def best_future_value(state):
        # The goal is terminal: nothing can be earned after reaching it.
        if state == goal:
            return 0
        return max(q[(state, a)] for a in actions)

    def choose_action(state):
        # Mostly random exploration with an occasional greedy step.
        if random.random() < exploit_prob:
            return argmax(actions, lambda a: get_utility(state, a, q))
        return random.choice(actions)

    def get_init_state():
        # Start next to the goal so early episodes are short.
        for a in actions:
            candidate = tuple(map(add, goal, a))
            if candidate in states:
                return candidate
        # No traversable cell at goal + action: fall back to the origin if it
        # is a state, otherwise to the goal itself (episodes become no-ops).
        return (0, 0) if (0, 0) in states else goal

    def distance_to_goal(state):
        return abs(state[0] - goal[0]) + abs(state[1] - goal[1])

    def update_checkpoint(checkpoint, state):
        # Return the farther of the two from the goal. The result must be
        # assigned by the caller; rebinding a parameter here would be local only.
        if distance_to_goal(state) > distance_to_goal(checkpoint):
            return state
        return checkpoint

    checkpoint = get_init_state()
    for _ in range(max_episode):
        s = checkpoint
        while s != goal:
            a = choose_action(s)
            s1 = get_state(s, a)
            r = rewards[s1]
            # Bootstrap from the successor state s1, not the current state.
            q[(s, a)] = (1 - alpha) * q[(s, a)] + alpha * (r + gamma * best_future_value(s1))
            s = s1
            # Later episodes start from the farthest state visited so far.
            checkpoint = update_checkpoint(checkpoint, s)
    return get_best_policy(states, actions, q)

### print_policy prints out the 2D policy map with steps up '^', down 'v', left '<', and right '>'; cells that have no policy entry are shown as 'x'.

In [10]:
def print_policy(world, policy):
    """Print the policy as a grid of arrows, one text line per world row.

    Parameters
    ----------
    world : list of list
        Grid addressed as ``world[y][x]``; only its dimensions are used.
    policy : dict
        Maps ``(x, y)`` to one of the offsets ``(0,-1)``, ``(1,0)``,
        ``(0,1)``, ``(-1,0)``, rendered as ``^``, ``>``, ``v``, ``<``.
        Cells without an entry are rendered as ``x``.

    Raises
    ------
    KeyError
        If a policy entry is not one of the four offsets above.
    """
    moves = {(0,-1):'^', (1,0):'>', (0,1):'v', (-1,0):'<'}
    for y, row in enumerate(world):
        line = "".join(
            moves[policy[(x, y)]] if (x, y) in policy else 'x'
            for x in range(len(row))
        )
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(line)

In [11]:
def print_world(world):
    """Print the grid world, one text line per row.

    ``world`` is a list of rows, each a sequence of single-character
    strings; the characters of a row are concatenated without separators.
    """
    for row in world:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("".join(row))

### Function value_iteration uses value iteration to solve the MDP grid map. It returns V and Pi, where V holds the largest expected discounted reward for each state. It is used as a reference for Q-Learning. (Note: no value_iteration implementation appears in the cells shown here.)

In [12]:
# Load the first grid; the relative path is resolved against the current working directory.
world1 = read_world("world1.txt")

In [13]:
# Train on world1; keyword arguments name each hyperparameter.
policy = q_learning(
    world1,
    costs,
    goal=(3, 0),
    reward=100,
    actions=NEIGHBORS,
    gamma=0.9,
    alpha=0.1,
    max_episode=100,
)

In [14]:
# Show the raw terrain so it can be compared with the policy printed next.
print_world(world1)

....
.x..
....


In [15]:
# NOTE: `policy` is reused for every world, so it must be the one just trained on world1.
print_policy(world1, policy)

>>>^
vx>^
>>>^


In [16]:
# Load the second grid; the relative path is resolved against the current working directory.
world2 = read_world("world2.txt")

In [17]:
# Train on world2; keyword arguments name each hyperparameter.
policy = q_learning(
    world2,
    costs,
    goal=(6, 6),
    reward=100,
    actions=NEIGHBORS,
    gamma=0.9,
    alpha=0.1,
    max_episode=100,
)

In [18]:
# Show the raw terrain so it can be compared with the policy printed next.
print_world(world2)

.******
.******
.******
.......
******.
******.
******.


In [19]:
# NOTE: `policy` is reused for every world, so it must be the one just trained on world2.
print_policy(world2, policy)

^<<<^vv
<<vvvvv
^<vv>vv
^>>>>>v
v^>>>>v
>>>>>>v
v>>>>>^


In [20]:
# Load the third grid; the relative path is resolved against the current working directory.
world3 = read_world("world3.txt")

In [21]:
# Show the raw terrain before training on it.
print_world(world3)

.....**********............
.......*********..xxxxxxx..
....xx***********xxx^^^xx^^
....^xxx****~~*****..^^xx^.
...^^xx**..~~~~***...^xxx^.
.^^^xx^^....~~~~~.....^x^..
.^^xx^^....^xxx~~~.....^...
..^^^^^......^xxx~~~..^^^..
...^^^......^^xx.~~..^^^...
...~~~..^^^xxxx...~.^^^....
..~~~~~.^^xxx^.....^xxx^...
.~~~~~..^xx^....~~..^xx^...
~~~~~..^^xx^.~~~~...^x^....
.~~~~..^**^....~~~~..^.....
....x..****^^^^.~~~..^x^...
...xxx******xxx^^.~.^xx^...
..xx**********xxx..xxx.....
...xx***********xxxx.......
...xxx********...^^........
....xxx******..........~~~~
..^^^^xx*****.x.....~~~~~~~
....^^^xxx**xx......~~~~~~~
......^^^xxxx....^^..~~~~~~
.^^..^^^^^.....^^xx^^.~~~~~
^x^^^^.....xxx^^xx.xx^^~~~~
^xxx^.....^^xxxx^^^^xxx~~~~
^^..........^^^^^....^^^...


In [22]:
# Train on world3; keyword arguments name each hyperparameter.
policy = q_learning(
    world3,
    costs,
    goal=(26, 26),
    reward=100,
    actions=NEIGHBORS,
    gamma=0.9,
    alpha=0.1,
    max_episode=100,
)

In [23]:
# NOTE: `policy` is reused for every world, so it must be the one just trained on world3.
print_policy(world3, policy)

^<<<vvv<<^v^<v>^>v>v^^v>>^^
^v<v>><<^>v<<<^^><xxxxxxx^^
v>><xx<<^>^v>^<>^xxxvvvxx^^
>^<^<xxxvvv<<>>v<>vvv<<xx>v
>^^<^xxvvvvv^>^><>><<<xxx>v
<^^^xx^v><><<v<^>>^<^v<x>v<
vv>xx<>v>v<vxxx^^^^>^^<>>><
>vv>^<>>>v>><<xxx<^^^^<^>^<
^vv<>>>^<^>^<^xxv<>>^<<vv^<
v^<<^>>^^^^xxxx<vv>^<^>><^v
^<<<v>v^<^xxx>^^^^<<xxx^v^v
^^^vv>v<<xx>v>>^<^^v<xx>v>>
^^vv>vv^vxxvvvv^>><v<x>>^vv
<vvv>v<<>^>><<<<>^^^<<>^>>^
^v>>x<<<vvv^^^^<<v>^<<x>>v<
<v^xxx^^v<^<xxx^>v>v<xxvv^^
><xx^v>^v<v>vvxxx>>xxxvv<>^
<^<xx<v>v^<^^vvvxxxx>><<^>v
><^xxxvv<><<v>>vvvvv>>v^^>>
v<^<xxx<v<<>>><<vvv^^<<<^^^
<^vvvvxx>^^v>vx^^^<<^^^<^^^
><v<vvvxxx>>xxv^^^<^v<^v><v
<>><v<<<vxxxxv>><<^^<<v<<^v
^^^^^<vvvvv^<^^<^xx^^<<^vvv
^x^^^>>vv>>xxx^>xx<xx^<>v>v
^xxxv>vvv^vvxxxxvvvvxxx>>>v
<>^>v>v>v<<<<<^>>>>>v<>>>>^
