In [56]:
# fundamental modules
import gymnasium as gym
import highway_env
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import gc
import time
import pprint
from tqdm.notebook import trange
import sys

In [57]:
# display visuals 
from utils import record_videos, show_videos

In [58]:
# deep learning modules
from keras.optimizers import Adam
from keras.layers import Dense, Conv2D, MaxPool2D, Flatten
from keras.models import Sequential
from keras.optimizers import Adam
import tensorflow as tf
import keras.backend as K
from keras.utils import plot_model
# Seed every random number generator this notebook draws from, not only
# TensorFlow, so that a Restart & Run All is repeatable.
SEED = 43
random.seed(SEED)         # stdlib RNG: random.choices() in ReplayBuffer.sample_buffer
np.random.seed(SEED)      # NumPy global RNG: epsilon-greedy draws in DDQNAgent.get_action
tf.random.set_seed(SEED)  # TensorFlow global seed


In [59]:
# PER

class ReplayBuffer(object):
    """
    Fixed-size circular replay memory with priority-weighted sampling.

    * Transitions are written at ``mem_cntr % mem_size``, so the oldest entry is
      overwritten once the buffer is full.
    * For DQN the actions are discrete and are stored one-hot encoded.
    * Every transition carries a priority; sampling probability is proportional
      to ``priority ** priority_scale``.
    """
    def __init__(self, max_size, min_size, input_shape, n_actions, discrete=True):
        """
        Args:
            max_size: capacity of the buffer (number of transitions).
            min_size: stored on the instance; not used by the buffer itself.
            input_shape: shape of a single observation.
            n_actions: width of one stored action row.
            discrete: if True, actions are stored as int8 one-hot rows.
        """
        self.mem_size = max_size
        self.mem_cntr = 0          # total number of transitions ever stored
        self.min_size = min_size
        self.discrete = discrete
        self.index = 0             # number of slots that may be sampled from

        self.state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float16)
        self.new_state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float16)
        dtype = np.int8 if self.discrete else np.float16
        self.action_memory = np.zeros((self.mem_size, n_actions), dtype=dtype)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float16)
        self.terminal_memory = np.zeros(self.mem_size)
        self.priorities = np.zeros(self.mem_size, dtype=np.float32)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the next slot of the circular buffer.

        Args:
            state: observation before the action.
            action: action index (discrete mode) or action vector.
            reward: scalar reward.
            state_: observation after the action.
            done: truthy if the transition ended the episode.
        """
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_

        #* store one hot encoding of actions, if appropriate
        if self.discrete:
            one_hot = np.zeros(self.action_memory.shape[1])
            one_hot[action] = 1.0
            self.action_memory[index] = one_hot
        else:
            self.action_memory[index] = action

        self.reward_memory[index] = reward
        #* the INVERSE of done is stored (1.0 = episode continues, 0.0 = ended)
        self.terminal_memory[index] = 1 - done
        #* a new transition gets at least priority 1.0 (and never less than the
        #* current maximum), so it can be sampled before its error is known
        self.priorities[index] = max(self.priorities.max(), 1.0)
        self.mem_cntr += 1
        self.index = self.mem_cntr

    def get_probabilities(self, priority_scale):
        """Return per-slot sampling probabilities, ``p_i ** scale / sum(p ** scale)``.

        The result covers all ``mem_size`` slots, including unwritten ones.
        """
        scaled_priorities = np.asarray(self.priorities) ** priority_scale
        # NumPy reduction instead of the builtin sum(), which would iterate the
        # whole priority array element by element in Python.
        return scaled_priorities / scaled_priorities.sum()

    def get_importance(self, probabilities):
        """Return importance weights ``1 / (mem_cntr * p)`` scaled so the largest is 1.

        Args:
            probabilities: sampling probabilities of the drawn transitions.
        """
        importance = 1.0 / (self.mem_cntr * np.asarray(probabilities))
        return importance / importance.max()

    def sample_buffer(self, batch_size, priority_scale=1.0, return_importance=False):
        """Draw ``batch_size`` transitions (with replacement), weighted by priority.

        Args:
            batch_size: number of transitions to draw.
            priority_scale: exponent applied to the priorities before normalising.
            return_importance: if True, the importance weights of the drawn
                transitions are appended to the returned tuple. Defaults to
                False, which keeps the original six-element return value.

        Returns:
            ``(states, actions, rewards, states_, terminal, sample_indices)``,
            followed by ``importance`` when ``return_importance`` is True.
        """
        # Only slots that have been written may be drawn.
        self.index = min(self.mem_cntr, self.mem_size)

        sample_probs = self.get_probabilities(priority_scale)
        sample_indices = random.choices(range(self.index), k=batch_size,
                                        weights=sample_probs[:self.index])

        states = self.state_memory[sample_indices]
        actions = self.action_memory[sample_indices]
        rewards = self.reward_memory[sample_indices]
        states_ = self.new_state_memory[sample_indices]
        terminal = self.terminal_memory[sample_indices]

        batch = (states, actions, rewards, states_, terminal, sample_indices)
        if return_importance:
            # Computed only on request; previously it was computed on every
            # call and then thrown away.
            return batch + (self.get_importance(sample_probs[sample_indices]),)
        return batch

    def set_priorities(self, indices, errors, offset=0.1):
        """Set the priority of each given slot to ``min(|error| + offset, 1.0)``.

        Args:
            indices: buffer slots to update.
            errors: one error value per slot.
            offset: added to every absolute error.
        """
        # Deliberately a plain loop: sampling is with replacement, so indices can
        # repeat, and the loop makes the last error for a repeated slot win.
        for i, e in zip(indices, errors):
            self.priorities[i] = np.minimum(abs(e) + offset, 1.0)

In [60]:
#DDQN agent

class DDQNAgent:
    """Double-DQN agent with a convolutional Q-network and prioritised replay.

    The agent picks one of nine discrete steering values with an epsilon-greedy
    policy, stores transitions in a ``ReplayBuffer`` and trains ``q_eval``
    against targets bootstrapped from ``q_target``.
    """

    def __init__(self, alpha, gamma, epsilon, obs_shape,
                 batch_size, epsilon_dec, epsilon_end, mem_size,
                 min_mem_size, learning_rate, replace_target):
        """
        Args:
            alpha: stored on the instance; not used elsewhere in this class.
            gamma: discount factor applied to the bootstrapped next-state value.
            epsilon: initial probability of taking a random action.
            obs_shape: shape of a single observation (without batch axis).
            batch_size: number of transitions sampled per training step.
            epsilon_dec: multiplicative decay applied to epsilon per training step.
            epsilon_end: lower bound for epsilon.
            mem_size: replay buffer capacity.
            min_mem_size: transitions required before training starts.
            learning_rate: learning rate of the Adam optimizer.
            replace_target: the target network is synced whenever the number of
                stored transitions is a multiple of this value.
        """
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        self.epsilon_end = epsilon_end
        self.batch_size = batch_size
        self.mem_size = mem_size
        self.min_mem_size = min_mem_size
        self.replace_target = replace_target
        self.obs_shape = obs_shape
        self.learning_rate = learning_rate

        self.discrete_action_space = np.array([-1.0, -0.75, -0.5, -0.25, 0.0, 0.25, 0.5, 0.75, 1.0])
        self.n_actions = len(self.discrete_action_space)
        self.action_space = [i for i in range(self.n_actions)]

        self.memory = ReplayBuffer(max_size=self.mem_size, min_size=self.min_mem_size,input_shape=self.obs_shape,
                             n_actions=self.n_actions,discrete=True)

        self.q_eval = self._make_model()
        self.q_target = self._make_model()      #we keep a target model which we update every K timesteps
        # Build BOTH networks from the observation shape (not a hard-coded one):
        # an unbuilt model has no weights, so set_weights() on it would fail.
        input_shape = (None, *self.obs_shape)
        self.q_eval.build(input_shape=input_shape)
        self.q_target.build(input_shape=input_shape)
        # Start with identical online and target weights.
        self.update_network_parameters()
        self.q_eval.summary()
        plot_model(self.q_eval, to_file='model_ddqn.png')

    def _make_model(self):
        """Create and compile the Q-network: three conv layers, a 512-unit dense
        layer and one linear output per discrete action."""
        model = Sequential()
        model.add(Conv2D(32, 8, strides=2, activation="relu"))
        model.add(Conv2D(64, 4, strides=2, activation="relu"))
        model.add(Conv2D(64, 3, strides=1, activation="relu"))
        model.add(Flatten())
        model.add( Dense(512, activation='relu') )
        model.add( Dense( self.n_actions))
        model.compile(loss='mse',optimizer= Adam(learning_rate = self.learning_rate),metrics=["accuracy"]) # type: ignore

        return model

    def epsilon_decay(self):
        """Multiply epsilon by ``epsilon_dec`` without going below ``epsilon_end``."""
        # max() keeps the final decay step from undershooting the floor.
        self.epsilon = max(self.epsilon * self.epsilon_dec, self.epsilon_end)

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self):
        """Copy the online network's weights into the target network."""
        self.q_target.set_weights(self.q_eval.get_weights())

    def get_action(self, observation):
        """Choose an action with an epsilon-greedy policy.

        Args:
            observation: a single observation of shape ``obs_shape``, or an
                already batched array with one extra leading axis.

        Returns:
            ``(action, action_index)``: the steering value taken from
            ``discrete_action_space`` and its index.
        """
        if np.random.random() > self.epsilon: # type: ignore
            batch = np.asarray(observation)
            # predict() expects a leading batch axis; add it for a bare observation.
            if batch.ndim == len(self.obs_shape):
                batch = batch[np.newaxis, ...]
            qs_ = self.q_eval.predict(batch, verbose=0)
            action_index = np.argmax(qs_)
        else:
            action_index = np.random.randint(0, self.n_actions)

        action = self.discrete_action_space[action_index]
        return action, action_index

    def _should_replace_target(self):
        """Return True when the stored-transition count is a multiple of ``replace_target``."""
        return self.replace_target > 0 and \
            self.memory.mem_cntr % self.replace_target == 0

    def train(self):
        """Run one Double-DQN update on a prioritised minibatch.

        Does nothing until ``min_mem_size`` transitions have been stored.
        Afterwards it updates the replay priorities from the TD error, fits the
        online network, syncs the target network when due, and decays epsilon.
        """
        if (self.memory.mem_cntr) < self.min_mem_size:
            return

        #* sample minibatch
        state, action, reward, new_state, done, sample_indices = \
                            self.memory.sample_buffer(self.batch_size)

        #* actions are stored one-hot, so argmax recovers the action index
        action_indices = np.argmax(action, axis=1)

        #* q values of current states by main network
        q_pred = self.q_eval.predict(state, verbose=0)

        #* q values of next states by target network
        q_next = self.q_target.predict(new_state, verbose=0)

        #* q values of next states by main network, used only to pick the action
        q_eval = self.q_eval.predict(new_state, verbose=0)
        max_actions = np.argmax(q_eval, axis=1)

        #* the training target is a copy of the prediction with only the taken
        #* action's value replaced, so q_pred itself stays intact for the error
        q_target = np.array(q_pred)
        batch_index = np.arange(self.batch_size, dtype=np.int32)

        #* new_q = reward + DISCOUNT * Q_target(s', argmax_a Q_eval(s', a));
        #* `done` holds the inverted terminal flag, so it zeroes the bootstrap
        #* term for transitions that ended an episode
        q_target[batch_index, action_indices] = reward + \
                    self.gamma*q_next[batch_index, max_actions]*done

        #* TD error drives the replay priorities
        error = q_pred[batch_index, action_indices]-q_target[batch_index, action_indices]
        self.memory.set_priorities(sample_indices, error)

        #* now we fit the main model (q_eval)
        _ = self.q_eval.fit(state, q_target, verbose=0)

        #* sync the target network every `replace_target` stored transitions
        #* (modulo, not bitwise AND)
        if self._should_replace_target():
            self.update_network_parameters()
            print("Target Updated")

        # NOTE(review): presumably here to curb memory growth from calling
        # predict()/fit() every step - confirm it is still needed.
        gc.collect()
        K.clear_session()
        self.epsilon_decay()

    def save_model(self, episode):
        """Save the online network's weights to ``weights/ddqn/q_net-<episode>.h5``."""
        print("-----saving models------")
        # The target directory may not exist yet on a fresh checkout.
        os.makedirs("weights/ddqn", exist_ok=True)
        self.q_eval.save_weights(f"weights/ddqn/q_net-{episode}.h5")

    def load_model(self, path="q_net.h5"):
        """Load online-network weights from ``path`` and copy them to the target network.

        Args:
            path: weights file to load. The default is the original hard-coded
                name; pass a ``weights/ddqn/q_net-<episode>.h5`` path to load a
                checkpoint written by ``save_model``.
        """
        print("-----loading models------")
        self.q_eval.load_weights(path)
        self.update_network_parameters()

In [61]:
def observation_shape(observation):
    """Return a copy of a 3-D observation with its first axis moved to the end.

    The axes are reordered from (0, 1, 2) to (1, 2, 0), e.g. a (4, 128, 64)
    array becomes (128, 64, 4).
    """
    reordered = np.asarray(observation).transpose(1, 2, 0)
    # np.array() copies, so the result does not share memory with the input.
    return np.array(reordered)

In [62]:
# environment config

env = gym.make('racetrack-v0', render_mode='rgb_array')
# gym.make() returns the environment wrapped in Gymnasium wrappers. Call
# configure() on the underlying highway-env object: forwarding the call through
# the wrapper is deprecated and no longer works in Gymnasium 1.0.
env.unwrapped.configure({
    'action': {'lateral': True,
            'longitudinal': False,
            'target_speeds': [0, 5],
            'type': 'ContinuousAction'},
        "observation": {
            "type": "GrayscaleObservation",
            "observation_shape": (128, 64),
            "stack_size": 4,
            "weights": [0.2989, 0.5870, 0.1140],  # weights for RGB conversion
            "scaling": 1.75,    },
    "other_vehicles": 1,
    'show_trajectories': False,
    'collision_reward': -1,
    'normalize_reward': True,

})

  logger.warn(


In [63]:
# prints env configs and takes the first observation
#* obs is reordered to channels-last by observation_shape() for the conv net

# Read the config from the underlying environment rather than through the
# Gymnasium wrapper (attribute forwarding is deprecated).
pprint.pprint(env.unwrapped.config)
(obs, info), done = env.reset(), False
obs = observation_shape(obs) 
print("Environment is setted up.")

{'action': {'lateral': True,
            'longitudinal': False,
            'target_speeds': [0, 5],
            'type': 'ContinuousAction'},
 'action_reward': -0.3,
 'centering_position': [0.5, 0.5],
 'collision_reward': -1,
 'controlled_vehicles': 1,
 'duration': 300,
 'lane_centering_cost': 4,
 'lane_centering_reward': 1,
 'manual_control': False,
 'normalize_reward': True,
 'observation': {'observation_shape': (128, 64),
                 'scaling': 1.75,
                 'stack_size': 4,
                 'type': 'GrayscaleObservation',
                 'weights': [0.2989, 0.587, 0.114]},
 'offscreen_rendering': False,
 'other_vehicles': 1,
 'other_vehicles_type': 'highway_env.vehicle.behavior.IDMVehicle',
 'policy_frequency': 5,
 'real_time_rendering': False,
 'render_agent': True,
 'scaling': 5.5,
 'screen_height': 600,
 'screen_width': 600,
 'show_trajectories': False,
 'simulation_frequency': 15}
Environment is setted up.


  logger.warn(


In [64]:
# agent config

agent = DDQNAgent(
    alpha=0.001,            # stored on the agent only
    gamma=0.9,              # discount factor
    epsilon=1.0,            # start fully exploratory
    obs_shape=obs.shape,    # shape of one reordered observation
    batch_size=64,
    epsilon_dec=0.999,      # multiplicative decay per training step
    epsilon_end=0.05,       # exploration floor
    mem_size=20000,         # replay buffer capacity
    min_mem_size=100,       # transitions required before training starts
    replace_target=100,     # target-network sync interval
    learning_rate=0.001,    # Adam learning rate
)

print("Agent is initialized.")


Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_25 (Conv2D)          (None, 61, 29, 32)        8224      
                                                                 
 conv2d_26 (Conv2D)          (None, 29, 13, 64)        32832     
                                                                 
 conv2d_27 (Conv2D)          (None, 27, 11, 64)        36928     
                                                                 
 flatten_6 (Flatten)         (None, 19008)             0         
                                                                 
 dense_12 (Dense)            (None, 512)               9732608   
                                                                 
 dense_13 (Dense)            (None, 9)                 4617      
                                                                 
Total params: 9,815,209
Trainable params: 9,815,209
N

In [65]:
#* basic params for training

score_history = []      # total reward of every finished episode
best_score = -1000.0    # best running-average score seen so far


In [None]:
# main loop

#TODO-1: Parameters Tuning

#TODO-6: Actions Shaping

#TODO-7: Pruning

#TODO-8: Tensorboard

env = record_videos(env)

for episode in trange(2000, desc='Test episodes'):
    (observation, info), done = env.reset(), False
    observation = observation_shape(observation)

    score = 0
    episode_over = False
    while not episode_over:
        # The Q-network expects a leading batch axis, so a single observation
        # is passed as a batch of one.
        action, action_index = agent.get_action(observation[np.newaxis, ...])
        new_observation, reward, done, truncated, info = env.step(action=[action])
        new_observation = observation_shape(new_observation)

        # Crashing or leaving the road is a failure and is penalised.
        failed = bool(info["crashed"]) or not info["rewards"]["on_road_reward"]
        if failed:
            reward = -1.0

        # Only a real ending is stored as terminal. Reaching the time limit
        # (truncated) also ends the episode, but it is neither penalised nor
        # stored as terminal, so surviving the whole episode is not treated
        # like a crash.
        done_ = failed or bool(done)

        score += reward

        agent.remember(state=observation, action=action_index, done=done_,
                       reward=reward, new_state=new_observation)
        agent.train()

        observation = new_observation
        episode_over = done_ or bool(truncated)

    score_history.append(score)
    # running average over the last 100 episodes
    avg_score = np.mean(score_history[-100:])

    if avg_score > best_score:
        best_score = avg_score
        agent.save_model(episode)

    print('episode ', episode, 'score %.1f' % score,
           'avg score %.1f' % avg_score)
    print("Exp- value:", agent.epsilon)

env.close()
# show_videos()

In [None]:
# Persist the per-episode scores. np.save() returns None, so its result is not
# bound to a name; NumPy appends the ".npy" extension to the file name.
np.save("score_history", np.array(score_history))

In [None]:
# best running-average score, then the running average after the last episode
print(best_score, avg_score, sep="\n")

In [None]:
#Evaluating
agent.load_model()

env = record_videos(env)

# Evaluate the greedy policy: exploration is switched off for the duration of
# the evaluation and the previous epsilon is restored afterwards.
training_epsilon = agent.epsilon
agent.epsilon = 0.0
try:
    for episode in trange(10, desc='Test episodes'):
        (observation, info), done = env.reset(), False
        # Same preprocessing as in training: the conv net needs the reordered
        # image observation, not a flattened vector.
        observation = observation_shape(observation)

        score = 0
        while True:
            # pass a batch of one to the Q-network
            action, action_index = agent.get_action(observation[np.newaxis, ...])
            new_observation, reward, done, truncated, info = env.step(action=[action])
            new_observation = observation_shape(new_observation)

            # crashing or leaving the road ends the episode with a penalty
            failed = bool(info["crashed"]) or not info["rewards"]["on_road_reward"]
            if failed:
                reward = -1.0

            score += reward

            observation = new_observation

            # also stop at the time limit, otherwise the loop would keep
            # stepping an episode the environment has already ended
            if done or truncated or failed:
                break

        print("Episode: ", episode)
        print("Score: ", score)
finally:
    agent.epsilon = training_epsilon

env.close()