In [4]:
import rpyc
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

import time
import utils
from tqdm import tqdm_notebook as tqdm

%load_ext autoreload
%autoreload 2

from environment.system import System
from agent.tabular_q_learning import Agent as T_Agent

Actions:
- 0 (rotation), 1 (other rotation), 2 (move outwards), 3 (move inwards)

In [14]:
import collections

class Environment():
    def __init__(self, field_classifier, reward_classifier, get_reward_function, get_state_function, state_queue_len=10):
        self.system = System(brick_ip='ev3dev.local', get_state_mode='dict')
        self.field_classifier = utils.load_pickle(field_classifier)
        self.reward_classifier = utils.load_pickle(reward_classifier)
        self.opposite_action = {0:1,1:0,2:3,3:2}
        
        self.on_field = True
        self.border_count = 0
#         self.color_on = color_on
        
        self.state_queue = collections.deque(maxlen=state_queue_len)
        self.reward_queue = collections.deque(maxlen=state_queue_len)
        
        self.get_reward_function = get_reward_function
        self.get_state_function = get_state_function
        
        for _ in range(state_queue_len):
            self._new_state()
        
    def reset(self):
        # stop current action
        self.system.reset()
        # Go to initial state

        # return state
#         return self.prepro([self.state])
      
    def go_to_init_state(self):
        self.system.go_to_init_state()
        print('#'*30)
        print('Going to Init')
        print('#'*30)
        time.sleep(5)
    
    @staticmethod
    def _color_from_one_state(s):
        return s[:3]
        
    def _environment_checks(self):
        # access color information from the last measurement 
        # according to self.new_state() ordering
        color = np.array([self._color_from_one_state(s) for s in self.state_queue[-2]]).reshape(1,-1)
        
        if self.field_classifier.predict(color) == [0]:
            print('I am outside')
            border_count += 1

            if self.on_field:
                self.__env.perform_actions([self.opposite_action[action]])
                print('BOUNCIN!!1')
                time.sleep(1)
            self.on_field = False
        else:
            self.on_field = True
    
        if border_count == 3:
            self.go_to_init_state()
            border_count = 0
            
    def _calculate_reward(self):
        # Predict propba
        if not self.on_field:
            return -20
        
        weights = np.ones(shape=(self.state_queue_len,))
        weights = [weight * i for i, weight in enumerate(weights)]
        
        colors = np.array([s[0] for s in self.state_queue]).squeeze()
#         r = (np.argmax(self.reward_classifier.predict_proba(x), axis = 1) == 1).sum()
        # sum the probabilities of black class and compute a function of it
        black_proba = self.reward_classifier.predict_proba(colors[-2:])[:,1]
        black_proba_weighted = [weight * p for weight, p in zip(weights, black_proba)]
      
        black_threshold = 0.3
        r = np.max([0, (np.sum(black_proba_weighted)-(black_threshold*self.num_measurements)) * 5])
        self.reward_queue.append(r)  
        

    def _new_state(self):
        s = self.system.get_state()
        color = s['cs'][0]
        top_pos = s['top'][0]
        bot_pos = s['bot'][0]
        self.state_queue.append((*color, top_pos, bot_pos))
        
    def _cycle(self, action):
        if action:
            self.system.perform_actions([action])
        
        # gets new states and puts it in the queue
        self._new_state()
        
        # environment specific checks like is it still in the field
        self._environment_checks()
        
        # calculate the reward 
        self._calculate_reward()
        
    def step(self, action, free_cycles=5):
        _cycle(action)  
        for _ in range(free_cycles):
            _cycle(None)
        return state, reward, done, {}
    
    @property
    def reward(self):
        return self.get_reward_function(self.reward_queue)
        
    @property
    def state(self):
        return self.get_state_function(self.state_queue)
    
    @property
    def action_space(self):
        return len(self.__env.get_action_space()[0])

def get_reward_function(reward_queue):
    return reward_queue[-1]
    
def get_state_function(state_queue):
    print(state_queue)
    state = list(state_queue)
    s = state[-5:]
    return np.array(s)

In [11]:
env = Environment('./mlp_on_off.pickle','./mlp_white_black.pickle', 
                  state_queue_len = 10, 
                  get_reward_function = get_reward_function,
                  get_state_function = get_state_function)

In [12]:
# env.reset()

In [13]:
env.state

deque([(20, 18, 16, 0, 0), (20, 18, 16, 0, 0), (20, 18, 16, 0, 0), (20, 17, 16, 0, 0), (20, 17, 16, 0, 0), (20, 18, 16, 0, 0), (20, 18, 16, 0, 0), (20, 17, 16, 0, 0), (20, 18, 16, 0, 0), (20, 18, 16, 0, 0)], maxlen=10)


array([[20, 18, 16,  0,  0],
       [20, 18, 16,  0,  0],
       [20, 17, 16,  0,  0],
       [20, 18, 16,  0,  0],
       [20, 18, 16,  0,  0]])

In [9]:
num_episodes = 30

# Make an Agent

q_table = T_Agent(4, learn_rate = .8, gamma =.95)

#create lists to contain total rewards and steps per episode
env.reset()
rewards = []

stop_flag = False
for i in range(num_episodes):
    # Decay the exploration
    q_table.explore_decay = i
    
    s = env.go_to_init_state()
    rAll = 0
    
    d = False
    j = 0
    #The Q-Table learning algorithm
    try:
        while j < 99:
            j+=1

            #Choose an action by greedily (with noise) picking from Q table
            a = q_table.next_action(s)
            print('Action',a)
            #Get new state and reward from environment
            s1,r,d,_ = env.step(a)
            print('\r   ', r)

            #Update Q-Table with new knowledge
            q_table.update(r, s1)

            rAll += r
            s = s1
            if d == True:
                break
    except KeyboardInterrupt:
#         stop_flag = True
        env.reset()
        break

    rewards.append(rAll)
    print('#'*10, 'End Episode', '#'*10)
    
print("Average score over last part " +  str(sum(rewards[-500:])/500))

##############################
Going to Init
##############################
Action 2
I am outside
BOUNCIN!!1
    -20
Action 2
I am outside
BOUNCIN!!1
    6.504167575075455
Action 2
    10.483359129936186
Action 3
    10.485833436968495
Action 1
I am outside
BOUNCIN!!1
    0.0
Action 3
    0.0
Action 1
    10.269726900932097
Action 0
    9.322725240916746
Action 0
    10.46088582212191
Action 1
    10.473609233489391
Action 1
    0.0
Action 3
    3.095531736002787
Action 1
    1.4847248880071218
Action 3
I am outside
BOUNCIN!!1
    -20
Action 2
I am outside
    6.58298959270123
Action 2
    2.4619132366819727
Action 1
I am outside
BOUNCIN!!1
    3.6989835960472894
Action 0
    6.104230455648736
Action 2
    1.3949511052051944
Action 3
    0.0
Action 2
    3.7202256433307443
Action 2
I am outside
BOUNCIN!!1
    0.0
Action 1
    0.0
Action 3
Average score over last part 0.0


In [None]:
start = time.time()
print(env.state)
print(time.time()- start)

In [7]:
env.reset()

(8, 4, 4, 0, 0)

In [None]:
q_table.val_table.shape