In [1]:
import rpyc
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

import time
import utils
from tqdm import tqdm_notebook as tqdm

%load_ext autoreload
%autoreload 2

from environment.system import System
from agent.tabular_q_learning import Agent as T_Agent

Actions:
- 0 (rotation), 1 (other rotation), 2 (move outwards), 3 (move inwards)

In [12]:
import collections

class Environment():
    """Gym-style wrapper around the robot controlled through ``System``.

    The environment keeps two bounded queues: the most recent raw observations
    (``state_queue``) and the most recent per-cycle reward approximations
    (``reward_queue``). The public ``state`` and ``reward`` properties are
    computed from those queues by the two callables given to the constructor.

    Parameters
    ----------
    field_classifier : path passed to ``utils.load_pickle``; the loaded object
        must expose ``predict`` (a prediction of 0 is treated as "off the field").
    reward_classifier : path passed to ``utils.load_pickle``; the loaded object
        must expose ``predict_proba`` (column 1 is used as the "black" probability).
    get_reward_function : callable mapping ``reward_queue`` to the reward.
    get_state_function : callable mapping ``state_queue`` to the state.
    state_queue_len : maximum length of both queues.
    """

    # Reward recorded for every cycle in which the robot is off the field.
    OFF_FIELD_PENALTY = -10.0
    # Number of off-field detections after which the robot is sent back to
    # its initial position.
    MAX_BORDER_COUNT = 3
    # reward_approx = max(0, REWARD_SCALE * (black_proba - REWARD_PROBA_OFFSET))
    REWARD_SCALE = 5.
    REWARD_PROBA_OFFSET = 0.3

    def __init__(self, field_classifier, reward_classifier, get_reward_function, get_state_function, state_queue_len=10):
        self.system = System(brick_ip='ev3dev.local', get_state_mode='dict')
        # NOTE(review): load_pickle presumably unpickles these files; only
        # load classifier files from a trusted source.
        self.field_classifier = utils.load_pickle(field_classifier)
        self.reward_classifier = utils.load_pickle(reward_classifier)

        # Action i is undone by action (n - i - 1). The previous hard-coded
        # {0:3, 1:2, 2:1, 3:0} table was immediately overwritten by this
        # mapping, so only the computed version is kept.
        n_actions = self.action_space
        self.opposite_action = {i: n_actions - i - 1 for i in range(n_actions)}

        self.on_field = True
        self.border_count = 0

        self.state_queue = collections.deque(maxlen=state_queue_len)
        self.reward_queue = collections.deque(maxlen=state_queue_len)

        self.get_reward_function = get_reward_function
        self.get_state_function = get_state_function

        # Pre-fill the state queue so ``state`` is usable before the first step.
        for _ in range(state_queue_len):
            self._new_state()

    def reset(self):
        """Stop the current action by resetting the underlying system.

        Does not move the robot to its initial position and returns nothing;
        use ``go_to_init_state`` for the former and ``state`` for an observation.
        """
        self.system.reset()

    def go_to_init_state(self):
        """Ask the system to go to its initial state, then wait 3 seconds."""
        self.system.go_to_init_state()
        print('#'*30)
        print('Going to Init')
        print('#'*30)
        time.sleep(3)

    @staticmethod
    def _color_from_one_state(s):
        """Return the colour part (first three entries) of one queued state."""
        return s[:3]

    def _environment_checks(self):
        """Detect leaving the field, bounce back once, and re-home if needed."""
        # Colour information of the latest measurements, in the ordering
        # produced by ``_new_state``.
        color = self._color_state_for_classifier

        if self.field_classifier.predict(color) == [0]:
            print('I am outside')
            self.border_count += 1

            # Only bounce on the transition from on-field to off-field.
            if self.on_field:
                self.system.perform_actions([self.opposite_action[a] for a in self.current_action])
                print('BOUNCIN!!1')
                time.sleep(1)
            self.on_field = False
        else:
            self.on_field = True

        if self.border_count == self.MAX_BORDER_COUNT:
            self.go_to_init_state()
            self.border_count = 0

    @property
    def _color_state_for_classifier(self):
        """Colours of the two most recent states, flattened to one row."""
        latest = list(self.state_queue)[-2:]
        return np.array([self._color_from_one_state(s) for s in latest]).reshape(1, -1)

    def _new_reward_approx(self):
        """Append the reward approximation for the current cycle to ``reward_queue``.

        Off the field the fixed ``OFF_FIELD_PENALTY`` is recorded. Previously
        the penalty was only *returned* (and the caller ignores the return
        value), so it never reached the queue and leaving the field was not
        penalised. On the field the reward is a clipped linear function of
        the classifier's probability for class 1.
        """
        if not self.on_field:
            self.reward_queue.append(self.OFF_FIELD_PENALTY)
            return

        colors = self._color_state_for_classifier
        # Probability of class 1 ("black") for the single input row.
        black_proba = self.reward_classifier.predict_proba(colors)[:, 1][0]
        reward_approx = np.max([0., self.REWARD_SCALE * (black_proba - self.REWARD_PROBA_OFFSET)])
        self.reward_queue.append(reward_approx)

    def _new_state(self):
        """Read the sensors and append ``(*color, top_pos, bot_pos)`` to ``state_queue``."""
        s = self.system.get_state()
        color = s['cs'][0]
        top_pos = s['top'][0]
        bot_pos = s['bot'][0]
        self.state_queue.append((*color, top_pos, bot_pos))

    def _cycle(self, is_free_cycle):
        """Run one observe/check/reward cycle.

        When ``is_free_cycle`` is false, ``self.current_action`` is sent to the
        system first; a free cycle only observes.
        """
        if not is_free_cycle:
            print("Performing action", self.current_action)
            self.system.perform_actions(self.current_action)

        # Get the new state and put it in the queue.
        self._new_state()

        # Environment-specific checks, e.g. is the robot still on the field.
        self._environment_checks()

        # Record the reward approximation for this cycle.
        self._new_reward_approx()

    def step(self, action, free_cycles=5):
        """Perform ``action`` once, then observe for ``free_cycles`` more cycles.

        Returns the tuple ``(state, reward, False, {})``; the done flag is
        always ``False`` and the info dict is always empty.
        """
        self.current_action = action
        self._cycle(False)
        for _ in range(free_cycles):
            self._cycle(True)
        return self.state, self.reward, False, {}

    @property
    def reward(self):
        """Reward computed by ``get_reward_function`` from ``reward_queue``."""
        return self.get_reward_function(self.reward_queue)

    @property
    def state(self):
        """State computed by ``get_state_function`` from ``state_queue``."""
        return self.get_state_function(self.state_queue)

    @property
    def action_space(self):
        """Length of the first entry of ``system.get_action_space()``."""
        return len(self.system.get_action_space()[0])

def get_reward_function(reward_queue):
    """Combine the most recent reward approximations into a single reward.

    The last (up to) three entries of ``reward_queue`` are multiplied by the
    linear weights 0, 1, 2, ... (oldest first) and the mean of the weighted
    values is returned.
    """
    recent = list(reward_queue)[-3:]

    # Linear weights: position k in the window gets weight k.
    positions = np.arange(len(recent), dtype=float)
    weighted = positions * np.asarray(recent, dtype=float)

    return np.mean(weighted)
    
def get_state_function(state_queue):
    """Return the last (up to) five queued states stacked into a numpy array."""
    return np.array([*state_queue][-5:])

In [13]:
# Build the environment: one pickled classifier decides on/off field, the
# other estimates the reward; both queues keep the last 10 entries.
env = Environment(
    field_classifier='./mlp_on_off.pickle',
    reward_classifier='./mlp_white_black.pickle',
    get_reward_function=get_reward_function,
    get_state_function=get_state_function,
    state_queue_len=10,
)

{'sensors': {'bot': <environment.system.EnvSensor object at 0x000001D98F636978>, 'top': <environment.system.EnvSensor object at 0x000001D98F636710>, 'cs': <environment.system.EnvSensor object at 0x000001D98F636748>}, 'actionables': {'bot': <environment.system.EnvActionable object at 0x000001D98F636940>, 'top': <environment.system.EnvActionable object at 0x000001D98F6369B0>}}


In [None]:
# Stop the current action (Environment.reset only calls system.reset()).
env.reset()

In [None]:
# Send the robot back to its initial state; this call sleeps for 3 seconds.
env.go_to_init_state()

In [None]:
# Manual single step with action index 2 for each list entry (presumably one
# per motor, 'top' and 'bot' -- TODO confirm); displays (state, reward, done, info).
env.step([2,2])

In [None]:
# Number of actions: length of the first entry of system.get_action_space().
env.action_space

In [10]:
# Smoke test: drive the environment with 100 random action pairs and print
# the (state, reward, done, info) tuple returned by each step.
for step_idx in range(100):
    random_actions = np.random.randint(env.action_space, size=2)
    step_result = env.step(list(random_actions))
    print(step_result)

Performing action [2, 2]
(array([[288, 258, 268, -15,  -8],
       [286, 255, 265, -22, -14],
       [286, 254, 265, -28, -20],
       [260, 207, 247, -34, -27],
       [ 30,  25,  27, -40, -32]]), 0.8292632230868603, False, {})
Performing action [0, 2]
I am outside


IndexError: list index out of range

In [None]:
# Bypass the Environment wrapper and send actions straight to the System;
# no state, field check or reward bookkeeping happens for this call.
env.system.perform_actions([1,1])

In [11]:
# Stop the current action (Environment.reset only calls system.reset()).
env.reset()

# Agent


The agent takes a vector/matrix as input and outputs a probability distribution over the actions.

The action is chosen by taking the argmax of that distribution. The reward is then 1 or 0, and from the reward we get the … (TODO: this sentence was left unfinished)

In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

from scipy.signal import lfilter
import matplotlib.pyplot as plt
%matplotlib inline

In [40]:
class Agent(nn.Module):
    """Two-layer policy network mapping a flattened observation window to
    a probability distribution over actions.

    Parameters
    ----------
    num_steps : number of time steps in the observation window.
    features : number of features per time step.
    num_actions : number of discrete actions (size of the output).
    num_hidden : width of the hidden layer.

    The expected input has ``num_steps * features`` values in its last
    dimension. The first layer was previously sized with
    ``num_steps * num_actions``, which only matched when ``features`` happened
    to equal ``num_actions``.
    """

    def __init__(self, num_steps, features, num_actions, num_hidden = 5):
        super(Agent, self).__init__()
        self.layer1 = nn.Linear(num_steps * features, num_hidden)
        self.layer2 = nn.Linear(num_hidden, num_actions)

    def forward(self, x):
        """Return action probabilities; the last dimension sums to 1."""
        hidden = F.relu(self.layer1(x))
        # An explicit dim avoids the deprecated implicit-dimension softmax and
        # normalises over the action dimension for both 1-D and batched input.
        return F.softmax(self.layer2(hidden), dim=-1)
      
        
    

In [41]:
# Instantiate the policy network with a 3-step window, 4 features and 4 actions.
agent = Agent(num_steps = 3, features = 4, num_actions = 4)

In [42]:
# Random input of length 12 (3 * 4), matching the input size of agent.layer1.
x_rand = torch.rand(12)

In [43]:
# Smoke-test the forward pass; the output is a softmax over the 4 actions.
agent(x_rand)

  if __name__ == '__main__':


tensor([0.3570, 0.2057, 0.1924, 0.2449], grad_fn=<SoftmaxBackward>)

In [None]:
num_episodes = 30
max_steps_per_episode = 99

# Make an Agent
# NOTE(review): the first argument (4) presumably is the number of actions -- confirm against T_Agent.
q_table = T_Agent(4, learn_rate = .8, gamma =.95)

# Stop any running action and create the list of total rewards per episode
env.reset()
rewards = []

for i in range(num_episodes):
    # Decay the exploration
    q_table.explore_decay = i

    # go_to_init_state() returns nothing, so read the starting observation
    # from env.state instead of using its return value (which was None).
    env.go_to_init_state()
    s = env.state
    rAll = 0

    d = False
    # The Q-Table learning algorithm
    try:
        for j in range(1, max_steps_per_episode + 1):
            # Choose an action by greedily (with noise) picking from Q table
            a = q_table.next_action(s)
            print('Action',a)
            # Get new state and reward from environment
            s1,r,d,_ = env.step(a)
            print('\r   ', r)

            # Update Q-Table with new knowledge
            q_table.update(r, s1)

            rAll += r
            s = s1
            if d:
                break
    except KeyboardInterrupt:
        # Stop the robot and abandon training; the interrupted episode is not recorded.
        env.reset()
        break

    rewards.append(rAll)
    print('#'*10, 'End Episode', '#'*10)

# Average over the episodes that actually exist instead of a hard-coded 500,
# which under-reported the score whenever fewer than 500 episodes were run.
recent_rewards = rewards[-500:]
if recent_rewards:
    print("Average score over last part " +  str(sum(recent_rewards)/len(recent_rewards)))
else:
    print("No completed episodes to average")

In [None]:
# Measure how long it takes to build and print the current state.
t_begin = time.time()
print(env.state)
elapsed = time.time() - t_begin
print(elapsed)

In [None]:
# Stop the current action (Environment.reset only calls system.reset()).
env.reset()

In [None]:
# Inspect the shape of the agent's value table.
q_table.val_table.shape