In [2]:
#import packages
import tensorflow as tf
import tensorflow.compat.v1 as tfc
import numpy as np
import random
import functools
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Using TensorFlow backend.


In [3]:
class DQfDAgent:
    """Agent scaffold for Deep Q-learning from Demonstrations (DQfD).

    The agent keeps two `Memory` buffers: `demos` (expert transitions only)
    and `replay` (expert transitions followed by the agent's own experience).
    It builds small Keras networks and computes 1-step and n-step Q targets
    for a sampled minibatch.

    The `configuration` object is read for these attributes:
    `replay_buffer_size`, `demo_buffer_size`, `eps_init`, `eps_fin`, `decay`,
    `alpha`, `gamma`, `minibatch`, `lamb` and `pretraining`.

    Each stored transition is indexed positionally as
    `[state, action, reward, next_state, done, is_demo,
      n_step_reward, n_step_state, n_step_done, actual_n]`.
    """

    def __init__(self, env, configuration, trajectories=None):
        """Create the buffers and seed them with demonstration transitions.

        Args:
            env: environment exposing `observation_space.shape[0]` and
                `action_space.n`.
            configuration: hyper-parameter holder (see class docstring).
            trajectories: iterable of demonstration transitions, or None for
                no demonstrations.
        """
        # `None` is the declared default, so treat it as "no demonstrations"
        # instead of failing on len(None).
        trajectories = [] if trajectories is None else list(trajectories)

        self.session = tfc.InteractiveSession()
        self.conf = configuration
        self.replay = Memory(capacity=self.conf.replay_buffer_size, permanent_data=len(trajectories))
        self.demos = Memory(capacity=self.conf.demo_buffer_size, permanent_data=self.conf.demo_buffer_size)
        # Demo data goes into both the demo memory and the replay memory.
        self.include_trajectories(trajectories=trajectories)
        self.step = 0
        self.eps = self.conf.eps_init
        self.state_no = env.observation_space.shape[0]
        self.action_no = env.action_space.n

    def include_trajectories(self, trajectories):
        """Store every demonstration transition in both `demos` and `replay`.

        Args:
            trajectories: iterable of transitions; None is treated as empty.
        """
        for tra in (trajectories if trajectories is not None else ()):
            # dtype=object keeps the mixed scalar/array fields as one flat record.
            self.demos.store(np.array(tra, dtype=object))
            self.replay.store(np.array(tra, dtype=object))

    def greedy_act(self, current_state, model):
        """Pick an action epsilon-greedily.

        With probability `self.eps` a uniformly random action is returned,
        otherwise the arg-max of `model.predict(current_state)[0]`.

        Args:
            current_state: model input; it is passed to `model.predict` as-is.
            model: object with a Keras-style `predict` method.

        Returns:
            Integer action index in `[0, self.action_no)`.
        """
        if np.random.random() < self.eps:
            # randint's upper bound is exclusive, so pass action_no itself;
            # `action_no - 1` would never explore the last action.
            return np.random.randint(0, self.action_no)
        return np.argmax(model.predict(current_state)[0])

    def neural_net_layers(self, no_units1, no_units2, reg=None):
        """Build and compile a three-layer dense Q-network.

        Args:
            no_units1: width of the first hidden layer.
            no_units2: width of the second hidden layer.
            reg: optional kernel regularizer applied to both hidden layers.

        Returns:
            A compiled Keras `Sequential` model mapping `state_no` inputs to
            `action_no` outputs, with MSE loss and Adam at rate `conf.alpha`.
        """
        # NOTE(review): no activation is passed to any Dense layer, so the
        # stack is a composition of linear maps - confirm this is intended.
        model = Sequential()
        model.add(Dense(no_units1, input_dim=self.state_no, kernel_regularizer=reg))
        model.add(Dense(no_units2, kernel_regularizer=reg))
        model.add(Dense(self.action_no))
        model.compile(loss="mean_squared_error", optimizer=Adam(lr=self.conf.alpha))

        return model

    def select_q(self):
        """Return a freshly built L2-regularised network (factor 0.2)."""
        # The factor is passed positionally because its keyword name differs
        # between TensorFlow releases.
        reg = tf.keras.regularizers.l2(0.2)
        return self.neural_net_layers(24, 48, reg)

    def eval_q(self):
        """Return a freshly built, unregularised network of the same shape."""
        return self.neural_net_layers(24, 48)

    def train_network(self, train=False, update=True):
        """Sample a minibatch and compute its 1-step and n-step Q targets.

        Args:
            train: when True sample from `self.demos`, otherwise from
                `self.replay`.
            update: currently unused; kept for interface compatibility.

        Returns:
            Tuple `(y_batch, n_step_y_batch, current_state_batch,
            action_batch, demos_data)`. The two target arrays have shape
            `(batch, action_no)`; the remaining items are per-sample lists in
            the same (shuffled) order.
        """
        self.step = self.step + 1

        actual_mem = self.demos if train else self.replay
        minibatch = actual_mem.sample(self.conf.minibatch)

        np.random.shuffle(minibatch)
        current_state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        new_state_batch = [data[3] for data in minibatch]
        done_batch = [data[4] for data in minibatch]
        demos_data = [data[5] for data in minibatch]
        nth_step_reward_batch = [data[6] for data in minibatch]
        nth_step_state_batch = [data[7] for data in minibatch]
        nth_step_done_batch = [data[8] for data in minibatch]
        actual_no = [data[9] for data in minibatch]

        # Build the networks used to compute the targets.
        # NOTE(review): networks are rebuilt on every call, so nothing learned
        # persists between calls - confirm whether they should be cached.
        select_q = self.select_q()
        eval_q = self.eval_q()
        n_step_eval_q = self.eval_q()

        batch_size = len(minibatch)
        y_batch = np.zeros((batch_size, self.action_no))
        n_step_y_batch = np.zeros((batch_size, self.action_no))
        for i in range(batch_size):
            state = np.asarray(current_state_batch[i]).reshape((-1, self.state_no))
            q_values = select_q.predict(state)[0]

            # 1-step target: r + gamma * max_a Q_eval(s', a), zeroed on terminal.
            # Each sample's next state is reshaped into its own local so the
            # batch list stays intact for later iterations.
            next_state = np.asarray(new_state_batch[i]).reshape((-1, self.state_no))
            new_q = max(eval_q.predict(next_state)[0])
            one_step_target = np.array(q_values)
            one_step_target[action_batch[i]] = (
                reward_batch[i] + (1 - int(done_batch[i])) * self.conf.gamma * new_q
            )
            y_batch[i] = one_step_target

            # n-step target: bootstrap from the state reached after the
            # n-step lookahead, discounted by gamma ** actual_n.
            nth_state = np.asarray(nth_step_state_batch[i]).reshape((-1, self.state_no))
            n_step_new_q = max(n_step_eval_q.predict(nth_state)[0])
            q_nth_step = (
                (1 - int(nth_step_done_batch[i]))
                * self.conf.gamma ** actual_no[i]
                * n_step_new_q
            )
            n_step_target = np.array(q_values)
            n_step_target[action_batch[i]] = nth_step_reward_batch[i] + q_nth_step
            n_step_y_batch[i] = n_step_target

        return y_batch, n_step_y_batch, current_state_batch, action_batch, demos_data

    def loss_lin(self, ae, a):
        """Margin term: 0.0 when `a` equals the expert action `ae`, else 0.75."""
        return 0.0 if ae == a else 0.75

    def loss_selec(self, select_q, action_batch, demo_data):
        """Sum the large-margin loss over the minibatch.

        For each sample the term is
        `demo_data[i] * (max_a(select_q[i][a] + loss_lin(ae, a)) - select_q[i][ae])`,
        so samples whose `demo_data` flag is 0 contribute nothing.

        Args:
            select_q: per-sample, per-action Q-values (indexable as `[i][a]`).
            action_batch: per-sample action indices.
            demo_data: per-sample multipliers (1 for demonstration samples).

        Returns:
            The accumulated margin loss over `conf.minibatch` samples.
        """
        inp = 0.0
        for i in range(self.conf.minibatch):
            ae = action_batch[i]
            max_val = float("-inf")
            for act in range(self.action_no):
                max_val = max(select_q[i][act] + self.loss_lin(ae, act), max_val)
            inp += demo_data[i] * (max_val - select_q[i][ae])
        return inp

    def loss(self, select_q, y_batch, n_step_y_batch, action_batch, demo_data, weights):
        """Combine the four loss terms, weighted by `conf.lamb`.

        The terms, in the order they are zipped with `conf.lamb`, are: 1-step
        TD loss, n-step TD loss, margin loss and L2 regularisation loss.

        Args:
            select_q: Q-values predicted for the minibatch.
            y_batch: 1-step targets.
            n_step_y_batch: n-step targets.
            action_batch: per-sample action indices.
            demo_data: per-sample demonstration flags.
            weights: multiplier applied to the combined loss.

        Returns:
            `weights` times the lambda-weighted sum of the four terms.
        """
        loss_dq = tf.math.reduce_mean(tf.math.squared_difference(select_q, y_batch))
        lloss_dq = tf.math.reduce_mean(tf.math.squared_difference(select_q, n_step_y_batch))
        # NOTE(review): the margin loss is evaluated on the targets (y_batch),
        # not on the network output (select_q) - confirm which is intended.
        loss_inp = self.loss_selec(y_batch, action_batch, demo_data)
        loss_l2 = tf.math.reduce_sum([tf.math.reduce_mean(reg_l) for reg_l in tfc.get_collection(tfc.GraphKeys.REGULARIZATION_LOSSES)])
        return weights * tf.math.reduce_sum([l * lam for l, lam in zip([loss_dq, lloss_dq, loss_inp, loss_l2], self.conf.lamb)])

    def perceive(self, trajectory):
        """Store one transition in the replay memory and decay epsilon.

        Epsilon is multiplied by `conf.decay` (floored at `conf.eps_fin`) only
        while the replay memory reports itself as full.

        Args:
            trajectory: a single transition record.
        """
        # dtype=object matches include_trajectories and avoids NumPy rejecting
        # the ragged mix of scalars and state arrays.
        self.replay.store(np.array(trajectory, dtype=object))

        if self.replay.full:
            self.eps = max(self.conf.eps_fin, self.eps * self.conf.decay)

    def train_ahead(self):
        """Run `conf.pretraining` demo-only steps, then reset the step counter."""
        for tr in range(self.conf.pretraining):
            self.train_network(train=True)
            if tr % 200 == 0 and tr > 0:
                print('Training step with expert demonstrations: {}'.format(tr))
        self.step = 0
        print('Finished demo training')
    
    
    

In [4]:
class Memory(object):
    """Fixed-capacity ring buffer whose leading slots are never overwritten.

    The first `permanent_data` slots are reserved (used by the agent for
    demonstration transitions). Once the buffer has filled, new transitions
    overwrite only the slots at index `permanent_data` and above, cycling
    through them oldest-first.
    """

    def __init__(self, capacity, permanent_data):
        """
        Args:
            capacity: maximum number of transitions held.
            permanent_data: number of leading slots that must never be
                overwritten; must satisfy `0 <= permanent_data <= capacity`.
        """
        self.capacity = capacity
        self.permanent_data = permanent_data
        assert 0 <= self.permanent_data <= self.capacity
        self.mem = []
        self.pos = 0          # index of the next slot to write
        self.full = False     # True once `capacity` transitions have been stored

    def store(self, transition):
        """Insert a transition, overwriting the oldest non-permanent slot if full.

        When every slot is permanent and already filled (or capacity is 0)
        there is nowhere to write, so the transition is dropped.
        """
        if self.pos >= self.capacity:
            # No writable slot left: all slots are permanent and occupied.
            self.full = True
            return
        if len(self.mem) < self.capacity:
            self.mem.append(None)
        self.mem[self.pos] = transition
        self.pos += 1
        if self.pos >= self.capacity:
            # The buffer is full once the cursor reaches the end; wrap past
            # the permanent prefix so that data is never overwritten.
            self.full = True
            self.pos = self.permanent_data

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random.

        Raises:
            ValueError: if `batch_size` exceeds the number of stored items.
        """
        return random.sample(self.mem, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.mem)
    
    