Q-learning
Mihai Maruseac committed Apr 9, 2011
1 parent b4466a4 commit 59ca4c2
Showing 4 changed files with 107 additions and 14 deletions.
6 changes: 3 additions & 3 deletions 1.txt
@@ -1,5 +1,5 @@
32 35
8 8
5
6 9
4 4
1
2
4
5 changes: 5 additions & 0 deletions globaldefs.py
@@ -30,6 +30,11 @@
TURN_LEFT = 43
TURN_RIGTH = 41

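# Indices into the state tuple built by World._get_state: the four distances
# as seen from the robot's current heading.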
FRONT = 0
LEFT = 1
BACK = 2
RIGHT = 3

IMAGES = {}

def build_sep_points(maxN, maxM):
63 changes: 58 additions & 5 deletions robot.py
@@ -3,6 +3,8 @@
# (c) Mihai Maruseac, 341C3 (2011), mihai.maruseac@rosedu.org
#

import random

from globaldefs import *

class Robot(object):
@@ -25,20 +27,71 @@ def __init__(self, config):
# state, action utility dictionary
self._Q = {}

# decided upon action (when using SARSA)
self.__a__ = None

def step(self, state):
"""
        Performs a single step: chooses and returns an action for the given state.
state Current state
"""
        # Choose an action for the current state
if self._Q.has_key(state):
a = self._choose_action(self._Q[state])
else:
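            # first visit to this state: initialise all action values to zero
            # and explore by picking an action uniformly at random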
self._Q[state] = {FORWARD:0, TURN_LEFT:0, TURN_RIGTH:0}
a = random.choice(self._Q[state].keys())
print 'From state {0}, chose {1}'.format(state, a)
return a

def receive_reward_and_state(self, olds, a, news, r):
"""
        Receive a reward after taking action a in state olds and reaching state
        news; update the learned (state, action) utilities accordingly.
olds Old state
a Action taken in that state
news New state
r Reward given
"""
print 'From state {0}, chose {1}, got to {2} with reward {3}'.\
format(olds, a, news, r)
if not self._Q.has_key(news):
q = 0
elif self._Q_or_SARSA:
# Q learning
d = self._Q[news]
q = max(zip(d.values(), d.keys()))[0]
else:
# SARSA
            q = self._Q[news][self._choose_action(self._Q[news], True)]
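        # Temporal-difference update: Q(olds, a) += alpha * (r + gamma * q - Q(olds, a)),
        # where q estimates the next state's value (max over actions for
        # Q-learning, value of the SARSA-chosen action otherwise)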
qa = self._Q[olds][a]
self._Q[olds][a] += self._alpha * (r + self._gamma * q - qa)
print 'Now Q is {0}'.format(self._Q)

def _choose_action(self, actions, future=False):
"""
        Choose an action from the given action-value dictionary.
actions Dictionary containing each possible action and its value.
future True if using SARSA and this call is only prospective (in
which case, the same action should be returned on the next
call, which will have future=False).
"""
a = actions.keys()[0]
if self._greedy:
# ε-greedy selection
tmp = random.uniform(0, 1)
if tmp > self._eps_or_tau:
a = max(zip(actions.values(), actions.keys()))[1]
else:
a = random.choice(actions.keys())
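        # For SARSA, a prospective (future=True) choice is cached so that the
        # next step() call replays exactly this action; a normal call consumes
        # and clears any cached choice instead.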
if future:
self.__a__ = a
        elif self.__a__ is not None:
            a = self.__a__  # take the action cached by the prospective call
self.__a__ = None
return a
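
A minimal usage sketch (not part of this commit) of the sense/act/learn loop
that drives Robot, as world.py's step() does below. The cfg and world objects
and the world.get_state()/world.apply() helpers are assumptions made for this
sketch; only Robot's interface is taken from the change above:

    from robot import Robot

    robot = Robot(cfg)                 # cfg: learning parameters (assumed)
    state = world.get_state()          # hypothetical accessor for the state tuple
    for _ in xrange(1000):
        act = robot.step(state)        # epsilon-greedy (or SARSA-cached) action
        newstate, reward = world.apply(act)  # hypothetical: execute act, observe outcome
        robot.receive_reward_and_state(state, act, newstate, reward)
        state = newstate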

47 changes: 41 additions & 6 deletions world.py
@@ -118,7 +118,7 @@ def step(self):

newstate = self._get_state()
reward = self._get_reward(newstate)
self._robot.receive_reward_and_state(state, act, newstate, reward)

def _get_state(self):
"""
@@ -128,21 +128,56 @@ def _get_state(self):
# assume that o = ROBOT_N (that is we are facing north)
# compute the real distances
state = [x, y, self._N - x - 1, self._M - y - 1]
print 'Real distances: {0}'.format(state)
# trim to range
for i in xrange(len(state)):
if state[i] > self._D:
state[i] = self._D
print 'Trimmed distances: {0}'.format(state)
# rotate state
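        # shifting by (o-1) lines the four distances up with the robot's
        # heading, so state[FRONT], state[LEFT], state[BACK] and state[RIGHT]
        # are measured relative to the direction it is facing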
state = state[(o-1):] + state[:(o-1)]
print 'State: {0}'.format(state)
# state.append(o)
return tuple(state)

def _get_reward(self, state):
"""
Returns the reward for a given state.
"""
        a = -10  # small negative step cost to discourage needless turning

# Penalty for being able to leave the grid
p = -1000
if state[FRONT] == 0:
print 'able to leave'
a += p

# Penalty for being between the min value and the borders
p = -5000
if state[RIGHT] < self._d1:
print 'too close'
a += p

# Penalty for being above the max value (toward the center of grid)
p = -5000
if state[RIGHT] > self._d2:
print 'too far'
a += p

# Bonus for being in the correct corridor
p = 100
if self._d1 <= state[RIGHT] <= self._d2:
print 'ok'
a += p

# Penalty for having a wrong orientation
p = -100
if state[RIGHT] > state[FRONT]:
print 'turn'
a += p
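        # Worked example (d1/d2 values are only illustrative): with d1=1, d2=3
        # and state FRONT=4, LEFT=2, BACK=0, RIGHT=2, none of the penalties
        # fire and the corridor bonus does, so a = -10 + 100 = 90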

return a
