In [1]:
import rpyc
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

import time
import utils
from tqdm import tqdm_notebook as tqdm

%load_ext autoreload
%autoreload 2

from environment.system import System
from agent.tabular_q_learning import Agent as T_Agent

Actions:
- 0 (rotation), 1 (other rotation), 2 (move outwards), 3 (move inwards)

In [25]:
import collections

class Environment():
    def __init__(self, field_classifier, reward_classifier, get_reward_function, get_state_function, state_queue_len=10):
        self.system = System(brick_ip='ev3dev.local', get_state_mode='dict')
        self.field_classifier = utils.load_pickle(field_classifier)
        self.reward_classifier = utils.load_pickle(reward_classifier)
        self.opposite_action = {0:3,1:2,2:1,3:0}
        
        self.on_field = True
        self.border_count = 0
#         self.color_on = color_on
        
        self.state_queue = collections.deque(maxlen=state_queue_len)
        self.reward_queue = collections.deque(maxlen=state_queue_len)
        
        self.get_reward_function = get_reward_function
        self.get_state_function = get_state_function
        
        for _ in range(state_queue_len):
            self._new_state()
        
    def reset(self):
        # stop current action
        self.system.reset()
        # Go to initial state

        # return state
#         return self.prepro([self.state])
      
    def go_to_init_state(self):
        self.system.go_to_init_state()
        print('#'*30)
        print('Going to Init')
        print('#'*30)
        time.sleep(3)
    
    @staticmethod
    def _color_from_one_state(s):
        return s[:3]
        
    def _environment_checks(self):
        # access color information from the last measurement 
        # according to self.new_state() ordering
        color = self._color_state_for_classifier
        
        if self.field_classifier.predict(color) == [0]:
            print('I am outside')
            self.border_count += 1

            if self.on_field:
                self.system.perform_actions([self.opposite_action[a] for a in self.current_action])
                print('BOUNCIN!!1')
                time.sleep(1)
            self.on_field = False
        else:
            self.on_field = True
    
        if self.border_count == 3:
            self.go_to_init_state()
            self.border_count = 0
            
    @property
    def _color_state_for_classifier(self):
        return np.array([self._color_from_one_state(s) for s in list(self.state_queue)[-2:]]).reshape(1,-1)
            
    def _calculate_reward(self):
        # Predict propba
        if not self.on_field:
            return -20
        
        weights = np.ones(shape=(self.state_queue.maxlen,))
        weights = [weight * i for i, weight in enumerate(weights)]
        
        colors = self._color_state_for_classifier
#         r = (np.argmax(self.reward_classifier.predict_proba(x), axis = 1) == 1).sum()
        # sum the probabilities of black class and compute a function of it
        black_proba = self.reward_classifier.predict_proba(colors)[:,1]
        black_proba_weighted = [weight * p for weight, p in zip(weights, black_proba)]
      
        black_threshold = 0.3
        r = np.max([0, (np.sum(black_proba_weighted)-(black_threshold*self.state_queue.maxlen)) * 5])
        self.reward_queue.append(r)  
        

    def _new_state(self):
        s = self.system.get_state()
        color = s['cs'][0]
        top_pos = s['top'][0]
        bot_pos = s['bot'][0]
        self.state_queue.append((*color, top_pos, bot_pos))
        
    def _cycle(self, is_free_cycle):
        if not is_free_cycle:
            print("Performing action", self.current_action)
            self.system.perform_actions(self.current_action)
        
        # gets new states and puts it in the queue
        self._new_state()
        
        # environment specific checks like is it still in the field
        self._environment_checks()
        
        # calculate the reward 
        self._calculate_reward()
        
    def step(self, action, free_cycles=5):
        self.current_action = action
        self._cycle(False)  
        for _ in range(free_cycles):
            self._cycle(True)
        return self.state, self.reward, False, {}
    
    @property
    def reward(self):
        return self.get_reward_function(self.reward_queue)
        
    @property
    def state(self):
        return self.get_state_function(self.state_queue)
    
    @property
    def action_space(self):
        return len(self.__env.get_action_space()[0])

def get_reward_function(reward_queue):
    return reward_queue[-1]
    
def get_state_function(state_queue):
    print(state_queue)
    state = list(state_queue)
    s = state[-5:]
    return np.array(s)

In [26]:
env = Environment('./mlp_on_off.pickle','./mlp_white_black.pickle', 
                  state_queue_len = 10, 
                  get_reward_function = get_reward_function,
                  get_state_function = get_state_function)



In [27]:
env.reset()

In [18]:
env.go_to_init_state()

##############################
Going to Init
##############################


In [19]:
env.step([2,2])

Performing action [2, 2]
I am outside
BOUNCIN!!1
I am outside
I am outside
##############################
Going to Init
##############################
I am outside
I am outside
deque([(246, 207, 223, 90, 16), (259, 217, 235, 96, 22), (235, 195, 205, 102, 29), (232, 190, 202, 107, 36), (173, 56, 58, 0, 3), (171, 57, 60, 0, 10), (182, 72, 58, 0, -22), (187, 120, 106, 0, -25), (152, 27, 27, 0, 0), (150, 26, 28, 0, 0)], maxlen=10)


(array([[171,  57,  60,   0,  10],
        [182,  72,  58,   0, -22],
        [187, 120, 106,   0, -25],
        [152,  27,  27,   0,   0],
        [150,  26,  28,   0,   0]]), 0.0, False, {})

In [28]:
for i in range(100):
    a = np.random.randint(4, size = 2)
    env.step(list(a))

Performing action [2, 0]
I am outside
BOUNCIN!!1
I am outside
BOUNCIN!!1
I am outside
##############################
Going to Init
##############################
deque([(255, 44, 65, 0, 0), (253, 43, 61, 0, 0), (255, 49, 63, 0, 0), (258, 42, 61, 0, 0), (28, 22, 25, -7, 1), (284, 254, 264, 75, -9), (291, 244, 262, 95, -9), (266, 47, 52, 110, -9), (280, 102, 112, 183, -17), (26, 25, 24, -5, 4)], maxlen=10)
Performing action [2, 2]
deque([(291, 244, 262, 95, -9), (266, 47, 52, 110, -9), (280, 102, 112, 183, -17), (26, 25, 24, -5, 4), (28, 18, 25, -3, 4), (27, 28, 28, 5, 5), (37, 35, 50, 11, 9), (257, 218, 230, 19, 14), (263, 228, 236, 26, 19), (262, 224, 238, 33, 25)], maxlen=10)
Performing action [0, 3]
I am outside
BOUNCIN!!1
I am outside
deque([(37, 35, 50, 11, 9), (257, 218, 230, 19, 14), (263, 228, 236, 26, 19), (262, 224, 238, 33, 25), (224, 195, 196, 45, 22), (150, 137, 158, 56, 9), (283, 248, 266, 64, -3), (275, 237, 255, 74, -15), (258, 73, 74, 85, -28), (260, 222, 230, -19, 70)]

I am outside
BOUNCIN!!1
##############################
Going to Init
##############################
I am outside
deque([(146, 145, 97, 37, -18), (284, 254, 267, 48, -30), (287, 248, 263, 59, -43), (274, 57, 72, 67, -55), (257, 219, 231, -39, 35), (264, 221, 238, -53, 19), (252, 61, 58, -65, 1), (29, 22, 21, -15, -3), (28, 26, 25, -15, -3), (25, 26, 21, -15, -3)], maxlen=10)
Performing action [1, 3]
deque([(252, 61, 58, -65, 1), (29, 22, 21, -15, -3), (28, 26, 25, -15, -3), (25, 26, 21, -15, -3), (32, 37, 29, -9, -4), (31, 26, 25, 3, -4), (190, 149, 221, 15, -4), (246, 201, 230, 28, -4), (37, 26, 33, 39, -4), (139, 143, 133, 51, -4)], maxlen=10)
Performing action [2, 2]
deque([(190, 149, 221, 15, -4), (246, 201, 230, 28, -4), (37, 26, 33, 39, -4), (139, 143, 133, 51, -4), (266, 231, 236, 58, 0), (266, 226, 238, 58, 7), (262, 223, 234, 58, 13), (261, 225, 234, 58, 19), (269, 232, 245, 63, 24), (276, 238, 252, 72, 31)], maxlen=10)
Performing action [1, 1]
deque([(262, 223, 234, 58, 13), (

deque([(268, 45, 41, 33, 180), (32, 30, 29, -10, -3), (26, 29, 29, -10, -3), (28, 29, 24, -10, -3), (27, 22, 20, -12, 4), (32, 21, 49, -18, 19), (258, 225, 242, -21, 33), (262, 222, 232, -21, 46), (265, 230, 235, -21, 59), (261, 218, 232, -21, 71)], maxlen=10)
Performing action [2, 1]
deque([(258, 225, 242, -21, 33), (262, 222, 232, -21, 46), (265, 230, 235, -21, 59), (261, 218, 232, -21, 71), (258, 220, 230, -21, 82), (259, 218, 231, -21, 83), (260, 226, 234, -21, 84), (258, 219, 229, -21, 85), (259, 221, 235, -21, 85), (257, 220, 226, -21, 85)], maxlen=10)
Performing action [2, 3]
deque([(260, 226, 234, -21, 84), (258, 219, 229, -21, 85), (259, 221, 235, -21, 85), (257, 220, 226, -21, 85), (268, 229, 233, -13, 85), (281, 250, 259, 3, 86), (280, 247, 252, 17, 89), (285, 250, 260, 30, 97), (286, 256, 261, 42, 104), (284, 256, 266, 54, 111)], maxlen=10)
Performing action [1, 2]
deque([(280, 247, 252, 17, 89), (285, 250, 260, 30, 97), (286, 256, 261, 42, 104), (284, 256, 266, 54, 111), (

deque([(281, 246, 260, 78, 47), (284, 249, 264, 78, 43), (285, 247, 254, 78, 39), (278, 249, 255, 78, 33), (284, 247, 260, 74, 29), (274, 234, 241, 67, 25), (269, 231, 240, 60, 19), (260, 223, 231, 53, 15), (114, 99, 91, 48, 10), (30, 20, 18, 41, 5)], maxlen=10)
Performing action [1, 0]
I am outside
BOUNCIN!!1
##############################
Going to Init
##############################
deque([(269, 231, 240, 60, 19), (260, 223, 231, 53, 15), (114, 99, 91, 48, 10), (30, 20, 18, 41, 5), (30, 21, 23, 28, -3), (235, 207, 204, -3, -9), (244, 204, 198, -3, -9), (240, 204, 200, -3, -9), (240, 205, 199, -3, -9), (243, 206, 201, -3, -9)], maxlen=10)
Performing action [0, 2]
I am outside
BOUNCIN!!1
deque([(244, 204, 198, -3, -9), (240, 204, 200, -3, -9), (240, 205, 199, -3, -9), (243, 206, 201, -3, -9), (248, 214, 209, -1, -16), (279, 244, 257, 6, -31), (283, 248, 262, 10, -45), (285, 256, 274, 12, -57), (289, 254, 266, 13, -69), (270, 64, 65, 15, -81)], maxlen=10)
Performing action [3, 2]
I am o

In [None]:
env.system.perform_actions([1,1])

In [29]:
env.reset()

# Agent


The agent takes as input a vector/matrix and output a probability distribution

The action is taken using an argmax. Then reward is 1 or 0 then from the reward get the 

In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

from scipy.signal import lfilter
import matplotlib.pyplot as plt
%matplotlib inline

In [40]:
class Agent(nn.Module):
    def __init__(self, num_steps, features, num_actions, num_hidden = 5):
        super(Agent, self).__init__()
        self.layer1 = nn.Linear(num_steps*num_actions, num_hidden)
        self.layer2 = nn.Linear(num_hidden, num_actions)
        
    def forward(self, x):
      x = F.relu(self.layer1(x))
      x = F.softmax(self.layer2(x))
      return x
      
        
    

In [41]:
agent = Agent(num_steps = 3, features = 4, num_actions = 4)

In [42]:
x_rand = torch.rand(12)

In [43]:
agent(x_rand)

  if __name__ == '__main__':


tensor([0.3570, 0.2057, 0.1924, 0.2449], grad_fn=<SoftmaxBackward>)

In [None]:
num_episodes = 30

# Make an Agent

q_table = T_Agent(4, learn_rate = .8, gamma =.95)

#create lists to contain total rewards and steps per episode
env.reset()
rewards = []

stop_flag = False
for i in range(num_episodes):
    # Decay the exploration
    q_table.explore_decay = i
    
    s = env.go_to_init_state()
    rAll = 0
    
    d = False
    j = 0
    #The Q-Table learning algorithm
    try:
        while j < 99:
            j+=1

            #Choose an action by greedily (with noise) picking from Q table
            a = q_table.next_action(s)
            print('Action',a)
            #Get new state and reward from environment
            s1,r,d,_ = env.step(a)
            print('\r   ', r)

            #Update Q-Table with new knowledge
            q_table.update(r, s1)

            rAll += r
            s = s1
            if d == True:
                break
    except KeyboardInterrupt:
#         stop_flag = True
        env.reset()
        break

    rewards.append(rAll)
    print('#'*10, 'End Episode', '#'*10)
    
print("Average score over last part " +  str(sum(rewards[-500:])/500))

In [None]:
start = time.time()
print(env.state)
print(time.time()- start)

In [None]:
env.reset()

In [None]:
q_table.val_table.shape