In [1]:
import rpyc
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

import time
import utils
from tqdm import tqdm_notebook as tqdm

%load_ext autoreload
%autoreload 2

from environment.system import System
from agent.tabular_q_learning import Agent as T_Agent

Actions:
- 0 (rotation), 1 (other rotation), 2 (move outwards), 3 (move inwards)

In [100]:
import collections

class Environment():
    def __init__(self, field_classifier, reward_classifier, get_reward_function, get_state_function, state_queue_len=10):
        self.system = System(brick_ip='ev3dev.local', get_state_mode='dict')
        self.field_classifier = utils.load_pickle(field_classifier)
        self.reward_classifier = utils.load_pickle(reward_classifier)
        self.opposite_action = {0:3,1:2,2:1,3:0}
        self.opposite_action = {i: self.action_space-i-1 for i in range(self.action_space)}
        
        self.on_field = True
        self.border_count = 0
#         self.color_on = color_on
        
        self.state_queue = collections.deque(maxlen=state_queue_len)
        self.reward_queue = collections.deque(maxlen=state_queue_len)
        
        self.get_reward_function = get_reward_function
        self.get_state_function = get_state_function
        
        for _ in range(state_queue_len):
            self._new_state()
        
    def reset(self):
        # stop current action
        self.system.reset()
        # Go to initial state

        # return state
#         return self.prepro([self.state])
      
    def go_to_init_state(self):
        print('#'*30)
        print('Going to Init')
        print('#'*30)
        self.system.go_to_init_state()
#         time.sleep(3)
    
    @staticmethod
    def _color_from_one_state(s):
        return s[:3]
        
    def _environment_checks(self):
        # access color information from the last measurement 
        # according to self.new_state() ordering
        color = self._color_state_for_classifier
        
        if self.field_classifier.predict(color) == [0]:
            print('I am outside')
            self.border_count += 1

            if self.on_field:
                self.system.perform_actions([self.opposite_action[a] for a in self.current_action])
                print('BOUNCIN!!1')
                time.sleep(1)
            self.on_field = False
        else:
            self.on_field = True
    
        if self.border_count == 3:
            self.go_to_init_state()
            self.border_count = 0
            
    @property
    def _color_state_for_classifier(self):
        return np.array([self._color_from_one_state(s) for s in list(self.state_queue)[-2:]]).reshape(1,-1)
            
    def _new_reward_approx(self):
        def transform_proba_into_reward_approx(proba):
            return np.max([0., 5. * (proba - 0.3)])
            
        # Predict propba
        if not self.on_field:
            return -10
        
        colors = self._color_state_for_classifier
#         r = (np.argmax(self.reward_classifier.predict_proba(x), axis = 1) == 1).sum()
        # sum the probabilities of black class and compute a function of it
        black_proba = self.reward_classifier.predict_proba(colors)[:,1][0]
        
        self.reward_queue.append(transform_proba_into_reward_approx(black_proba))
        
    def _new_state(self):
        s = self.system.get_state()
        color = s['cs'][0]
        top_pos = s['top'][0]
        bot_pos = s['bot'][0]
        self.state_queue.append((*color, top_pos, bot_pos))
        
    def _cycle(self, is_free_cycle):
        if not is_free_cycle:
            print("Performing action", self.current_action)
            self.system.perform_actions(self.current_action)
        
        # gets new states and puts it in the queue
        self._new_state()
        
        # environment specific checks like is it still in the field
        self._environment_checks()
        
        # calculate the reward 
        self._new_reward_approx()
        
    def step(self, action, free_cycles=5):
        self.current_action = action
        self._cycle(False)  
        for _ in range(free_cycles):
            self._cycle(True)
        return self.state, self.reward, False, {}
    
    @property
    def reward(self):
        return self.get_reward_function(self.reward_queue)
        
    @property
    def state(self):
        return self.get_state_function(self.state_queue)
    
    @property
    def action_space(self):
        return len(self.system.get_action_space()[0])

def get_reward_function(reward_queue):
    # how many last reward_approx to take into consideration
    rewards = list(reward_queue)[-3:]
    
    # linear weights
    weights = np.ones(shape=(len(rewards),))
    weights = [weight * i for i, weight in enumerate(weights)]
    rewards_weighted = [weight * p for weight, p in zip(weights, rewards)]
    
    # custom function deciding what should be rewarded (because reward_approx is based on probability)
#     black_threshold = 0.3
#     r = np.max([0, (np.sum(rewards_weighted)-(black_threshold*len(rewards_weighted))) * 5])
    
    return np.mean(rewards_weighted)
    
def get_state_function(state_queue):
#     print(state_queue)
    state = list(state_queue)
    s = state[-5:]
    return np.array(s)

In [3]:
env = Environment('./mlp_on_off.pickle','./mlp_white_black.pickle', 
                  state_queue_len = 10, 
                  get_reward_function = get_reward_function,
                  get_state_function = get_state_function)

{'sensors': {'bot': <environment.system.EnvSensor object at 0x10fa34630>, 'top': <environment.system.EnvSensor object at 0x10fa44cc0>, 'cs': <environment.system.EnvSensor object at 0x10fa44d30>}, 'actionables': {'bot': <environment.system.EnvActionable object at 0x10fa34668>, 'top': <environment.system.EnvActionable object at 0x10fa346a0>}}




In [5]:
env.go_to_init_state()

##############################
Going to Init
##############################


In [None]:
env.step([2,2])

In [None]:
env.action_space

In [4]:
env.reset()
for i in range(100):
    a = np.random.randint(env.action_space, size = 2)
    print(env.step(list(a)))

Performing action [6, 5]
(array([[23, 23, 34,  3, 10],
       [23, 23, 33,  9, 16],
       [22, 21, 31, 13, 21],
       [22, 22, 32, 19, 26],
       [28, 31, 34, 23, 31]]), 3.4898517182817934, False, {})
Performing action [2, 6]
(array([[ 37,  41,  37,  33,  24],
       [ 48,  49,  69,  35,  18],
       [227, 243, 286,  38,  13],
       [243, 263, 301,  41,   8],
       [244, 264, 307,  45,   4]]), 0.0, False, {})
Performing action [6, 6]
(array([[244, 266, 314,  51,  14],
       [240, 263, 307,  55,  23],
       [238, 261, 305,  61,  30],
       [239, 262, 308,  68,  36],
       [239, 262, 307,  75,  43]]), 0.0, False, {})
Performing action [1, 1]
(array([[240, 263, 306,  66,  36],
       [237, 260, 303,  56,  26],
       [241, 265, 308,  46,  16],
       [241, 263, 308,  36,   6],
       [242, 264, 309,  27,  -3]]), 0.0, False, {})
Performing action [3, 7]
I am outside
BOUNCIN!!1
(array([[239, 246, 282,  31, -19],
       [221, 153, 179,  37, -22],
       [229, 158, 189,  43, -24],
  

(array([[145, 158, 221,  12,  -9],
       [242, 262, 295,  15, -15],
       [241, 264, 302,  18, -20],
       [236, 259, 303,  21, -26],
       [226, 222, 249,  23, -31]]), 0.0, False, {})
Performing action [3, 2]
(array([[222, 183, 191,  14, -35],
       [226, 188, 194,   7, -35],
       [230, 203, 192,   1, -35],
       [232, 232, 246,  -3, -35],
       [233, 245, 267,  -8, -35]]), 0.0, False, {})
Performing action [5, 3]
(array([[239, 265, 301, -10, -30],
       [241, 267, 300, -10, -28],
       [240, 267, 302, -10, -28],
       [240, 266, 301, -10, -28],
       [240, 265, 300, -10, -28]]), 0.0, False, {})
Performing action [1, 5]
(array([[243, 266, 302,   0, -42],
       [240, 257, 291,   7, -50],
       [234, 204, 194,   9, -56],
       [221, 156, 174,  11, -60],
       [220, 149, 189,  12, -64]]), 0.0, False, {})
Performing action [0, 0]
I am outside
BOUNCIN!!1
I am outside
I am outside
##############################
Going to Init
##############################
(array([[214, 153,

(array([[ 36,  38,  54,  -5,  13],
       [ 61,  70,  63, -10,  18],
       [189, 210, 255, -13,  22],
       [220, 243, 301, -16,  26],
       [214, 239, 287, -19,  30]]), 0.0, False, {})
Performing action [5, 7]
(array([[216, 231, 289,  -5,  34],
       [219, 239, 293,   4,  34],
       [221, 240, 296,  10,  38],
       [219, 238, 294,  16,  45],
       [219, 230, 291,  22,  52]]), 0.0, False, {})
Performing action [7, 7]
I am outside
BOUNCIN!!1
(array([[216, 185, 225,  39,  72],
       [207, 138, 163,  48,  80],
       [211, 124, 158,  56,  88],
       [221, 234, 296, -37,  -4],
       [223, 200, 283, -47, -16]]), 0.0, False, {})
Performing action [1, 6]
(array([[233, 240, 307, -45, -37],
       [233, 254, 303, -41, -42],
       [235, 262, 299, -37, -47],
       [192, 220, 218, -34, -52],
       [ 52,  67,  44, -30, -57]]), 0.93372986070428, False, {})
Performing action [5, 6]
(array([[ 22,  20,  40, -22, -50],
       [ 44,  44,  85, -17, -43],
       [ 98, 100, 175, -13, -36],
    

In [None]:
env.system.perform_actions([1,1])

In [8]:
env.reward_queue, env.state_queue

(deque([0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        2.079713820024192,
        3.374547125265779,
        0.0,
        0.0]),
 deque([(181, 191, 269, -9, -20),
        (221, 237, 285, -4, -16),
        (233, 252, 293, 1, -11),
        (229, 253, 296, 6, -6),
        (235, 262, 296, 3, 0),
        (183, 187, 253, -5, 6),
        (22, 23, 34, -11, 10),
        (103, 122, 128, -16, 14),
        (214, 239, 295, -21, 19),
        (210, 234, 280, -26, 23)]))

In [9]:
env.reset()

In [12]:
env.state_queue

deque([(181, 191, 269, -9, -20),
       (221, 237, 285, -4, -16),
       (233, 252, 293, 1, -11),
       (229, 253, 296, 6, -6),
       (235, 262, 296, 3, 0),
       (183, 187, 253, -5, 6),
       (22, 23, 34, -11, 10),
       (103, 122, 128, -16, 14),
       (214, 239, 295, -21, 19),
       (210, 234, 280, -26, 23)])

# Agent


The agent takes as input a vector/matrix and output a probability distribution

The action is taken using an argmax. Then reward is 1 or 0 then from the reward get the 

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

from scipy.signal import lfilter
import matplotlib.pyplot as plt
%matplotlib inline

In [84]:
class Agent(nn.Module):
    def __init__(self, num_steps, num_features, num_actions, num_hidden = 5):
        super(Agent, self).__init__()
        self.layer1 = nn.Linear(num_steps*num_features, num_hidden)
        self.layer2 = nn.Linear(num_hidden, num_actions)
        
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.softmax(self.layer2(x))
        return x
      
      
      
criterion = torch.nn.BCELoss()


In [101]:
print(np.clip.__doc__)


    Clip (limit) the values in an array.

    Given an interval, values outside the interval are clipped to
    the interval edges.  For example, if an interval of ``[0, 1]``
    is specified, values smaller than 0 become 0, and values larger
    than 1 become 1.

    Parameters
    ----------
    a : array_like
        Array containing elements to clip.
    a_min : scalar or array_like or `None`
        Minimum value. If `None`, clipping is not performed on lower
        interval edge. Not more than one of `a_min` and `a_max` may be
        `None`.
    a_max : scalar or array_like or `None`
        Maximum value. If `None`, clipping is not performed on upper
        interval edge. Not more than one of `a_min` and `a_max` may be
        `None`. If `a_min` or `a_max` are array_like, then the three
        arrays will be broadcasted to match their shapes.
    out : ndarray, optional
        The results will be placed in this array. It may be the input
        array for in-place clipping

In [86]:
env.reset()

In [128]:
def find_line_state(state_queue):
    s = np.array(state_queue)[-3:,-2:]
    return torch.FloatTensor(s).view(-1)
  
def find_line_reward(reward_queue):
    r = np.array(reward_queue)[-5:].mean()
    if r > .8:
        return 1
    else:
        return 0
    
  
env = Environment('./mlp_on_off.pickle','./mlp_white_black.pickle', 
                  state_queue_len = 10, 
                  get_reward_function = find_line_reward,
                  get_state_function = find_line_state)

agent = Agent(num_steps = 3, num_features = 2, num_actions = 8, num_hidden = 5)

optimizer = torch.optim.Adam(agent.parameters(),lr= .1)



{'sensors': {'bot': <environment.system.EnvSensor object at 0x115d262e8>, 'top': <environment.system.EnvSensor object at 0x115d26080>, 'cs': <environment.system.EnvSensor object at 0x115d260b8>}, 'actionables': {'bot': <environment.system.EnvActionable object at 0x115d262b0>, 'top': <environment.system.EnvActionable object at 0x115d26320>}}




In [129]:
#training:
# Input for this classifier: previous 3 positions (x_1, y_1, x_2, y_2, x_3, y_3) output get action
# each 10 steps update the weights
# Decide when the action gives 1 for right and 0 for wrong (so a 1-0 reward)
# Then use the Batch Cross Entropy Loss
# Tadamorrow

train_steps = 10
batch_size = 50

env.reset()
state = env.state
for i in range(train_steps):
    pred = []
    true = []
    
    for j in range(batch_size):
        action_prob = agent(state)
    
        action = torch.multinomial(action_prop, 1).detach().numpy()[0]

        state, rew, _, _ = env.step([action//2,action%2])
        pred.append(action_prob[action])

        true.append(rew)


    pred = torch.stack(pred)
    true = torch.FloatTensor(true)
    loss = criterion(pred, true)
    agent.zero_grad()
    loss.backward()
    optimizer.step()

    pred = []
    true = []
    
env.go_to_init_state()
        
        
  

#loss = criterion(X,Y)
#model.zero_grad()
#loss.backward()
#optimizer.step()


  if __name__ == '__main__':


Performing action [2, 1]
Performing action [2, 1]
I am outside
BOUNCIN!!1
I am outside
BOUNCIN!!1
I am outside
##############################
Going to Init
##############################
Performing action [0, 1]
I am outside
BOUNCIN!!1
I am outside
Performing action [2, 1]
I am outside
##############################
Going to Init
##############################
Performing action [2, 1]
Performing action [2, 1]
Performing action [2, 1]
Performing action [2, 1]
Performing action [0, 0]
I am outside
BOUNCIN!!1
I am outside
Performing action [0, 1]
I am outside
##############################
Going to Init
##############################
Performing action [2, 1]
Performing action [2, 1]
Performing action [2, 1]
Performing action [0, 1]
I am outside
BOUNCIN!!1
I am outside
Performing action [2, 1]
I am outside
##############################
Going to Init
##############################
Performing action [2, 1]
Performing action [2, 1]
Performing action [2, 1]
Performing action [0, 1]
I am outsi

Performing action [0, 1]
I am outside
BOUNCIN!!1
I am outside
I am outside
##############################
Going to Init
##############################
Performing action [2, 1]
Performing action [0, 1]
I am outside
BOUNCIN!!1
I am outside
I am outside
##############################
Going to Init
##############################
Performing action [2, 1]
Performing action [2, 1]
I am outside
BOUNCIN!!1
I am outside
I am outside
BOUNCIN!!1
##############################
Going to Init
##############################
Performing action [3, 1]
Performing action [2, 1]
Performing action [2, 1]
Performing action [2, 1]
Performing action [0, 0]
I am outside
BOUNCIN!!1
Performing action [2, 1]
I am outside
I am outside
##############################
Going to Init
##############################
Performing action [2, 1]
Performing action [2, 1]
Performing action [2, 1]
Performing action [2, 1]
Performing action [2, 1]
Performing action [0, 0]
I am outside
BOUNCIN!!1
I am outside
I am outside
BOUNCIN!!1

KeyboardInterrupt: 

In [127]:
env.go_to_init_state()

##############################
Going to Init
##############################


In [114]:
x = [torch.rand(1) for _ in range(10)]
print(torch.tensor(x))
y = torch.randint(2,size = (10,))
x/x.sum()

tensor([0.1408, 0.6992, 0.8653, 0.3987, 0.2965, 0.2274, 0.2707, 0.6729, 0.2036,
        0.9388])


AttributeError: 'list' object has no attribute 'sum'

In [72]:
torch.multinomial(x/x.sum(), 1)

tensor([2])

In [17]:
x_np, y_np = x.numpy, y.numpy

(y*np.log(x) +(1-y)*np.log(1-x)).mean()

tensor(-1.3572)

In [41]:
agent = Agent(num_steps = 3, features = 4, num_actions = 4)

In [42]:
x_rand = torch.rand(12)

In [43]:
agent(x_rand)

  if __name__ == '__main__':


tensor([0.3570, 0.2057, 0.1924, 0.2449], grad_fn=<SoftmaxBackward>)

In [None]:
num_episodes = 30

# Make an Agent

q_table = T_Agent(4, learn_rate = .8, gamma =.95)

#create lists to contain total rewards and steps per episode
env.reset()
rewards = []

stop_flag = False
for i in range(num_episodes):
    # Decay the exploration
    q_table.explore_decay = i
    
    s = env.go_to_init_state()
    rAll = 0
    
    d = False
    j = 0
    #The Q-Table learning algorithm
    try:
        while j < 99:
            j+=1

            #Choose an action by greedily (with noise) picking from Q table
            a = q_table.next_action(s)
            print('Action',a)
            #Get new state and reward from environment
            s1,r,d,_ = env.step(a)
            print('\r   ', r)

            #Update Q-Table with new knowledge
            q_table.update(r, s1)

            rAll += r
            s = s1
            if d == True:
                break
    except KeyboardInterrupt:
#         stop_flag = True
        env.reset()
        break

    rewards.append(rAll)
    print('#'*10, 'End Episode', '#'*10)
    
print("Average score over last part " +  str(sum(rewards[-500:])/500))

In [None]:
start = time.time()
print(env.state)
print(time.time()- start)

In [None]:
env.reset()

In [None]:
q_table.val_table.shape