In [29]:
import rpyc
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

import time
import utils
from tqdm import tqdm_notebook as tqdm

%load_ext autoreload
%autoreload 2

from environment.system import System

from agent.tabular_q_learning import Agent as T_Agent


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Actions:
- 0 (rotation), 1 (other rotation), 2 (move outwards), 3 (move inwards)

In [123]:
class Environment():
    def __init__(self, field_classifier, reward_classifier, delta_measurement = .05, num_measurements = 10):
        self.__env = System(brick_ip='ev3dev.local', get_state_mode='dict')
        self.delta_measurement = delta_measurement
        self.num_measurements = num_measurements
        self.field_classifier = utils.load_pickle(field_classifier)
        self.reward_classifier = utils.load_pickle(reward_classifier)
        self.opposite_action = {0:1,1:0,2:3,3:2}
        
        self.on_field = True

    def reset(self):
        # stop current action
        self.__env.reset()
        # Go to initial state

        # return state
        return self.prepro([self.state])

    def step(self, action):
        # give the action to the motors
        self.__env.perform_actions([action])
        
        state = []
        done = False
        
        # we will perform this action for 
        measurement = 0
        

        while measurement < self.num_measurements:
            start = time.time()
            time_arr = []
            # Get the current state
            s = self.state
            state.append(s)
            start_1 = time.time()
            time_arr.append(start_1-start)
            
            measurement += 1
            
            #Sleep a bit so next time we get a different state
            time.sleep(self.delta_measurement)
            start_2 = time.time()
            time_arr.append(start_2-start_1)
            
            # A check whether we are still in the field
            if self.field_classifier.predict(np.array(s['cs']).reshape(1,-1)) == [0]:
                print('I am outside')
                # if not then do the opposite actions
                if self.on_field:
                  self.__env.perform_actions([self.opposite_action[action]])
                  print('BOUNCIN!!1')
                  time.sleep(1)
                self.on_field = False
            else:
                self.on_field = True
                
            time_arr.append(time.time()-start_2)    
        # Stop the actions
        self.__env.stop()
        
        # Calculate the intermediate reward
        reward = self.calculate_reward(state)
        
        return self.prepro(state), reward, done, {}
      
    def calculate_reward(self, state):
        # Predict propba
        if not self.on_field:
          return -20
    
        x = np.array([s['cs'] for s in state]).squeeze()
        r = (np.argmax(self.reward_classifier.predict_proba(x), axis = 1) == 1).sum()
        return r
    
    def prepro(self,state):
        s = state[-1]
        x = (s['cs'][0][0]//10,s['cs'][0][1]//10,s['cs'][0][2]//10, s['bot'][0]//36, s['top'][0]//36)
        return x
    @property
    def state(self):
      return self.__env.get_state()
    
    
    @property
    def action_space(self):
      return len(self.__env.get_action_space()[0])
      

    

In [124]:
env = Environment('./mlp_on_off.pickle','./mlp_white_black.pickle', delta_measurement= 0.0, num_measurements = 5)



In [125]:
env.state

{'bot': (-115,), 'cs': ((179, 167, 221),), 'top': (59,)}

In [126]:
env.action_space

4

In [128]:
env.reset()

(20, 14, 18, 0, 0)

In [129]:
num_episodes = 30

# Make an Agent

q_table = T_Agent(4, learn_rate = .8, gamma =.95)

#create lists to contain total rewards and steps per episode

rewards = []

for i in range(num_episodes):
    # Decay the exploration
    q_table.explore_decay = i
    
    s = env.reset()
    rAll = 0
    
    d = False
    j = 0
    #The Q-Table learning algorithm
    while j < 99:
        j+=1
       
        #Choose an action by greedily (with noise) picking from Q table
        a = q_table.next_action(s)
        print('Action',a)
        #Get new state and reward from environment
        s1,r,d,_ = env.step(a)
        print('\r   ', r)
        
        #Update Q-Table with new knowledge
        q_table.update(r, s1)

        rAll += r
        s = s1
        if d == True:
            break

    rewards.append(rAll)
print("Average score over last part " +  str(sum(rewards[-500:])/500))

Action 1
    0
Action 0
    0
Action 3
    1
Action 0
    1
Action 0
    0
Action 1
    0
Action 3
I am outside
BOUNCIN!!1
    -20
Action 2
    1
Action 1
    1
Action 1
    0
Action 3
    0
Action 1
    3
Action 2
    4
Action 0
    0
Action 1
    0
Action 1
    2
Action 1
I am outside
BOUNCIN!!1
    2
Action 3
    2
Action 0
    3
Action 1
    4
Action 3
    2
Action 3
I am outside
BOUNCIN!!1
    3
Action 0
    0
Action 2
    2
Action 2
I am outside
BOUNCIN!!1
    1
Action 1
    0
Action 3
    1
Action 0
    0
Action 0
    0
Action 2
    0
Action 3
    0
Action 1
    1
Action 1
    0
Action 3
I am outside
BOUNCIN!!1
    3
Action 0
    3
Action 3
    0
Action 1
    0
Action 3
I am outside
BOUNCIN!!1
    1
Action 0
    1
Action 1
    0
Action 2
    1
Action 0
    3
Action 0
    5
Action 2
    4
Action 1
    4
Action 3
    0
Action 3
    1
Action 0
I am outside
BOUNCIN!!1
    -20
Action 3
I am outside
BOUNCIN!!1
    -20
Action 2
    0
Action 2
    2
Action 2
I am outside
BOUNCIN!!1
    

KeyboardInterrupt: 

In [None]:
start = time.time()
print(env.state)
print(time.time()- start)

In [130]:
env.reset()

(24, 22, 29, 0, 0)

In [99]:
q_table.val_table

array([[0.    , 0.    , 0.    , 0.8   ],
       [0.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 0.    ],
       [0.8   , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 0.8   ],
       [0.    , 0.    , 0.8   , 0.    ],
       [0.    , 0.608 , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 1.6   , 0.    ],
       [0.    , 0.    , 0.    , 0.    ],
       [0.4416, 0.    , 0.    , 0.8   ],
       [0.    , 0.    , 0.    , 0.    ],
       [0.    , 0.8   , 0.    , 0.    ],
       [0.    , 0.    , 2.4   , 0.    ],
       [0.    , 0.8   , 0.    , 0.    ],
       [1.6   , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 4.    ],
       [0.    , 4.    , 0.    , 0.    ],
       [4.    , 0.    , 0.    , 0.    ],
       [0.    , 2.4   , 0.    , 0.    ],
       [0.    , 