In [None]:
import rpyc
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

import time
import utils
from tqdm import tqdm_notebook as tqdm

%load_ext autoreload
%autoreload 2

from environment.system import System

from agent.tabular_q_learning import Agent as T_Agent


Actions:
- 0 (rotate), 1 (rotate in the opposite direction), 2 (move outwards), 3 (move inwards)

In [2]:
class Environment():
    """Gym-style wrapper around the robot ``System`` for the tabular Q-learning agent.

    The wrapper drives the robot with one discrete action per ``step``, samples
    the sensor state several times while the action runs, uses a "field"
    classifier to detect leaving the playing field and a "reward" classifier to
    score the collected colour readings.
    """

    def __init__(self, field_classifier, reward_classifier, delta_measurement = .05, num_measurements = 10, color_on = True):
        """Connect to the robot and load both classifiers.

        Args:
            field_classifier: path handed to ``utils.load_pickle``; the loaded
                object must offer ``predict`` (class 0 is treated as "outside").
            reward_classifier: path handed to ``utils.load_pickle``; the loaded
                object must offer ``predict_proba`` (column 1 is used as the
                "black" probability).
            delta_measurement: seconds to sleep between two measurements.
            num_measurements: number of measurements taken per ``step``.
            color_on: when False the colour sensor is ignored: no field check,
                reward is always 0 and states are ``(bot, top)`` tuples.
        """
        self.__env = System(brick_ip='ev3dev.local', get_state_mode='dict')
        self.delta_measurement = delta_measurement
        self.num_measurements = num_measurements
        # NOTE(review): presumably these are unpickled objects -- only load
        # pickle files from a trusted source.
        self.field_classifier = utils.load_pickle(field_classifier)
        self.reward_classifier = utils.load_pickle(reward_classifier)
        # Action that undoes each action; used to bounce back into the field.
        self.opposite_action = {0:1,1:0,2:3,3:2}

        self.on_field = True
        self.color_on = color_on

    def reset(self):
        """Reset the underlying system and return the ``prepro`` state tuple.

        NOTE(review): this does not drive the robot to its initial position
        (use ``go_to_init_state`` for that), and the tuple returned here is
        discretised differently from the state returned by ``step``.
        """
        # stop current action
        self.__env.reset()

        # return state
        return self.prepro([self.state])

    def go_to_init_state(self):
        """Send the robot to its initial state and wait 5 seconds. Returns None."""
        self.__env.go_to_init_state()
        print('#'*30)
        print('Going to Init')
        print('#'*30)
        # Fixed pause, presumably to let the robot finish moving -- TODO confirm.
        time.sleep(5)

    def step(self, action):
        """Run ``action`` for ``num_measurements`` samples and score the result.

        Args:
            action: key of ``opposite_action`` (0-3), forwarded to the motors.

        Returns:
            ``(state, reward, done, info)`` where ``state`` is the discretised
            index of the last measurement (or the ``(bot, top)`` tuple when
            ``color_on`` is False), ``done`` is always False and ``info`` is an
            empty dict.
        """
        # give the action to the motors
        self.__env.perform_actions([action])

        state = []
        done = False  # episodes are never terminated from inside step()
        border_count = 0

        for _ in range(self.num_measurements):
            # Get the current state
            s = self.get_state()
            state.append(s)

            # Sleep a bit so next time we get a different state
            time.sleep(self.delta_measurement)

            # A check whether we are still in the field
            if self.color_on:
                if self.field_classifier.predict(s['raw_col']) == [0]:
                    print('I am outside')
                    border_count += 1

                    # Only bounce on the first "outside" reading after having
                    # been inside, not on every consecutive one.
                    if self.on_field:
                        self.__env.perform_actions([self.opposite_action[action]])
                        print('BOUNCIN!!1')
                        time.sleep(1)
                    self.on_field = False
                else:
                    self.on_field = True

            # Three "outside" readings within one step: give up and re-home.
            if border_count == 3:
                self.go_to_init_state()
                border_count = 0

        # Stop the actions
        self.__env.stop()

        # Calculate the intermediate reward
        if self.color_on:
            reward = self.calculate_reward(state)
        else:
            reward = 0

        # get_state() yields a dict with colour and a bare tuple without it;
        # indexing the tuple with 'index' used to raise TypeError.
        last = state[-1]
        observation = last['index'] if isinstance(last, dict) else last

        return observation, reward, done, {}

    def calculate_reward(self, state):
        """Compute the reward for the measurements gathered during one step.

        Args:
            state: list of dicts as produced by ``get_state`` with colour on;
                each must carry a ``'raw_col'`` array.

        Returns:
            -20 when the robot is currently off the field, otherwise a
            non-negative value that grows with the position-weighted sum of the
            "black" probabilities.
        """
        if not self.on_field:
            return -20

        # Linear weights 0, 1, 2, ...: later measurements count more and the
        # first measurement is ignored entirely.
        weights = np.arange(self.num_measurements, dtype=float)
        # One row per measurement. reshape (instead of squeeze) keeps the
        # matrix 2-D even when there is only a single measurement.
        x = np.array([s['raw_col'] for s in state]).reshape(len(state), -1)
        # sum the probabilities of black class and compute a function of it
        black_proba = self.reward_classifier.predict_proba(x)[:,1]
        black_proba_weighted = [weight * p for weight, p in zip(weights, black_proba)]

        black_threshold = 0.3
        r = np.max([0, (np.sum(black_proba_weighted)-(black_threshold*self.num_measurements)) * 5])
        return r

    def prepro(self,state):
        """Discretise the last raw state dict of ``state`` into a tuple.

        Deprecated: preprocessing is meant to happen in ``get_state``; this is
        only used by ``reset``.
        """
        s = state[-1]
        if self.color_on:
            x = (s['cs'][0][0]//10,s['cs'][0][1]//10,s['cs'][0][2]//10, s['bot'][0]//36, s['top'][0]//36)
        else:
            x = (s['bot'][0]//36, s['top'][0]//36)
        return x

    def get_state(self):
        """Read the sensors twice and return a discretised state.

        Returns:
            With colour on, a dict with ``'index'`` (both colour readings
            floor-divided by 3, followed by the bot/top values floor-divided by
            36) and ``'raw_col'`` (array holding one row with both raw colour
            readings). With colour off, the tuple ``(bot, top)``.
        """
        # Two consecutive reads: their colour readings are concatenated below.
        s_1 = self.state
        s_2 = self.state

        # Keep the bucketed values in locals instead of writing them back into
        # the dict returned by the backend, which must not be modified.
        bot = s_2['bot'][0]//36
        top = s_2['top'][0]//36

        if self.color_on:
            col = np.r_[s_1['cs'][0],s_2['cs'][0]]
            col_ind = col//3

            s = {'index': (*tuple(col_ind), bot, top), 'raw_col' : np.array([col])}
        else:
            s = (bot, top)
        return s

    @property
    def state(self):
        """Raw state as returned by the underlying system (one backend call per access)."""
        return self.__env.get_state()

    @property
    def action_space(self):
        """Length of the first entry of the system's action space."""
        return len(self.__env.get_action_space()[0])
      

    

In [3]:
# Build the environment: 3 back-to-back measurements per step, no pause between them.
env = Environment(
    field_classifier='./mlp_on_off.pickle',
    reward_classifier='./mlp_white_black.pickle',
    delta_measurement=0.0,
    num_measurements=3,
)



In [4]:
# Reset the underlying system and display the discretised tuple produced by prepro().
env.reset()

(27, 23, 25, 0, 0)

In [5]:
# Sanity check: show one sampled state (discretised 'index' plus the raw colour readings).
env.get_state()

{'index': (92, 80, 85, 91, 78, 85, 0, 0),
 'raw_col': array([[277, 241, 257, 274, 234, 257]])}

In [6]:
num_episodes = 30

# Make an Agent (4 discrete actions, see the action list above)
q_table = T_Agent(4, learn_rate = .8, gamma =.95)

# Total reward collected in each episode
env.reset()
rewards = []

for i in range(num_episodes):
    # Decay the exploration
    q_table.explore_decay = i

    # go_to_init_state() has no return value, so read the starting state
    # explicitly instead of feeding None to the agent.
    env.go_to_init_state()
    s = env.get_state()
    if isinstance(s, dict):
        # With colour enabled get_state() wraps the discretised state in a dict.
        s = s['index']
    rAll = 0

    d = False
    # The Q-Table learning algorithm: at most 99 steps per episode
    for j in range(1, 100):
        # Choose an action by greedily (with noise) picking from Q table
        a = q_table.next_action(s)
        print('Action',a)
        # Get new state and reward from environment
        s1,r,d,_ = env.step(a)
        print('\r   ', r)

        # Update Q-Table with new knowledge
        q_table.update(r, s1)

        rAll += r
        s = s1
        if d:
            break

    rewards.append(rAll)
    print('#'*10, 'End Episode', '#'*10)

# Average over the episodes actually present in the window (at most the last
# 500) instead of always dividing by 500.
recent_rewards = rewards[-500:]
print("Average score over last part " +  str(sum(recent_rewards)/max(len(recent_rewards), 1)))

##############################
Going to Init
##############################
Action 2
    0.0
Action 2
    5.443958513527857
Action 1
    0.0
Action 1
    0.0
Action 0
    0.0
Action 1
    0.0
Action 1
    0.0
Action 1
    0.0
Action 2
I am outside
BOUNCIN!!1
    -20
Action 2
    0.0
Action 3
I am outside
BOUNCIN!!1
I am outside
I am outside
##############################
Going to Init
##############################
    -20
Action 3
    0.0
Action 0
    0.0
Action 1
    0.0
Action 0
    0.0
Action 3
    0.0
Action 2
I am outside
BOUNCIN!!1
I am outside
I am outside
##############################
Going to Init
##############################
    -20
Action 0
    0.0
Action 3
    0.0
Action 3
I am outside
BOUNCIN!!1
    -20
Action 0
    0.0
Action 3
I am outside
BOUNCIN!!1
    -20
Action 0
    0.0
Action 2
    0.0
Action 1
    0.0
Action 0
    0.0
Action 2
    6.9295740699709985
Action 2
    0.0
Action 2
    0.0
Action 0
I am outside
BOUNCIN!!1
I am outside
I am outside
###################

EOFError: [Errno 60] Operation timed out

In [None]:
# Measure how long a single state read from the robot takes (seconds).
# perf_counter() is monotonic and high-resolution, unlike the wall clock.
start = time.perf_counter()
print(env.state)
print(time.perf_counter() - start)

In [None]:
# Reset the underlying system again (e.g. after the interrupted training run above).
env.reset()

In [None]:
# Inspect the dimensions of the agent's value table (presumably the learned Q-values -- confirm in agent.tabular_q_learning).
q_table.val_table.shape