Q-learning
Mihai Maruseac committed Apr 9, 2011
1 parent b4466a4 commit 59ca4c2
Showing 4 changed files with 107 additions and 14 deletions.
6 changes: 3 additions & 3 deletions 1.txt
@@ -1,5 +1,5 @@
32 35
8 8
5
6 9
4 4
1
2
4
5 changes: 5 additions & 0 deletions globaldefs.py
@@ -30,6 +30,11 @@
TURN_LEFT = 43
TURN_RIGTH = 41

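# Indices into the state tuple built by World._get_state: the four distances
# as seen from the robot's current heading.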
FRONT = 0
LEFT = 1
BACK = 2
RIGHT = 3

IMAGES = {}

def build_sep_points(maxN, maxM):
63 changes: 58 additions & 5 deletions robot.py
@@ -3,6 +3,8 @@
# (c) Mihai Maruseac, 341C3 (2011), mihai.maruseac@rosedu.org
#

import random

from globaldefs import *

class Robot(object):
@@ -25,20 +27,71 @@ def __init__(self, config):
# state, action utility dictionary
self._Q = {}

# decided upon action (when using SARSA)
self.__a__ = None

def step(self, state):
"""
        Performs a single step: chooses and returns an action for the given state.
state Current state
"""
        # Choose an action for the current state
if self._Q.has_key(state):
a = self._choose_action(self._Q[state])
else:
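            # first visit to this state: initialise all action values to zero
            # and explore by picking an action uniformly at random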
self._Q[state] = {FORWARD:0, TURN_LEFT:0, TURN_RIGTH:0}
a = random.choice(self._Q[state].keys())
print 'From state {0}, chose {1}'.format(state, a)
return a

def receive_reward_and_state(self, olds, a, news, r):
"""
        Receive a reward after taking action a in state olds and reaching state
        news; update the learned (state, action) utilities accordingly.
olds Old state
a Action taken in that state
news New state
r Reward given
"""
print 'From state {0}, chose {1}, got to {2} with reward {3}'.\
format(olds, a, news, r)
if not self._Q.has_key(news):
q = 0
elif self._Q_or_SARSA:
# Q learning
d = self._Q[news]
q = max(zip(d.values(), d.keys()))[0]
else:
# SARSA
            q = self._Q[news][self._choose_action(self._Q[news], True)]
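        # Temporal-difference update: Q(olds, a) += alpha * (r + gamma * q - Q(olds, a)),
        # where q estimates the next state's value (max over actions for
        # Q-learning, value of the SARSA-chosen action otherwise)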
qa = self._Q[olds][a]
self._Q[olds][a] += self._alpha * (r + self._gamma * q - qa)
print 'Now Q is {0}'.format(self._Q)

def _choose_action(self, actions, future=False):
"""
        Choose an action from the given action-value dictionary.
actions Dictionary containing each possible action and its value.
future True if using SARSA and this call is only prospective (in
which case, the same action should be returned on the next
call, which will have future=False).
"""
a = actions.keys()[0]
if self._greedy:
# ε-greedy selection
tmp = random.uniform(0, 1)
if tmp > self._eps_or_tau:
a = max(zip(actions.values(), actions.keys()))[1]
else:
a = random.choice(actions.keys())
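        # For SARSA, a prospective (future=True) choice is cached so that the
        # next step() call replays exactly this action; a normal call consumes
        # and clears any cached choice instead.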
if future:
self.__a__ = a
        elif self.__a__ is not None:
            a = self.__a__  # take the action cached by the prospective call
self.__a__ = None
return a
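
A minimal usage sketch (not part of this commit) of the sense/act/learn loop
that drives Robot, as world.py's step() does below. The cfg and world objects
and the world.get_state()/world.apply() helpers are assumptions made for this
sketch; only Robot's interface is taken from the change above:

    from robot import Robot

    robot = Robot(cfg)                 # cfg: learning parameters (assumed)
    state = world.get_state()          # hypothetical accessor for the state tuple
    for _ in xrange(1000):
        act = robot.step(state)        # epsilon-greedy (or SARSA-cached) action
        newstate, reward = world.apply(act)  # hypothetical: execute act, observe outcome
        robot.receive_reward_and_state(state, act, newstate, reward)
        state = newstate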

47 changes: 41 additions & 6 deletions world.py
@@ -118,7 +118,7 @@ def step(self):

newstate = self._get_state()
reward = self._get_reward(newstate)
self._robot.receive_reward_and_state(state, act, newstate, reward)

def _get_state(self):
"""
@@ -128,21 +128,56 @@ def _get_state(self):
# assume that o = ROBOT_N (that is we are facing north)
# compute the real distances
state = [x, y, self._N - x - 1, self._M - y - 1]
print 'Real distances: {0}'.format(state)
# trim to range
for i in xrange(len(state)):
if state[i] > self._D:
state[i] = self._D
print 'Trimmed distances: {0}'.format(state)
# rotate state
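        # shifting by (o-1) lines the four distances up with the robot's
        # heading, so state[FRONT], state[LEFT], state[BACK] and state[RIGHT]
        # are measured relative to the direction it is facing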
state = state[(o-1):] + state[:(o-1)]
print 'State: {0}'.format(state)
# state.append(o)
return tuple(state)

def _get_reward(self, state):
"""
Returns the reward for a given state.
"""
        a = -10  # small negative step cost to discourage needless turning

# Penalty for being able to leave the grid
p = -1000
if state[FRONT] == 0:
print 'able to leave'
a += p

# Penalty for being between the min value and the borders
p = -5000
if state[RIGHT] < self._d1:
print 'too close'
a += p

# Penalty for being above the max value (toward the center of grid)
p = -5000
if state[RIGHT] > self._d2:
print 'too far'
a += p

# Bonus for being in the correct corridor
p = 100
if self._d1 <= state[RIGHT] <= self._d2:
print 'ok'
a += p

# Penalty for having a wrong orientation
p = -100
if state[RIGHT] > state[FRONT]:
print 'turn'
a += p
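        # Worked example (d1/d2 values are only illustrative): with d1=1, d2=3
        # and state FRONT=4, LEFT=2, BACK=0, RIGHT=2, none of the penalties
        # fire and the corridor bonus does, so a = -10 + 100 = 90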

return a
