In [18]:
# fundemental modules
import gymnasium as gym
import highway_env
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import gc
import time
import pprint
from tqdm.notebook import trange
import sys

In [19]:
# display visuals 
from utils import record_videos, show_videos

In [20]:
# deep learning modules
from keras.optimizers import Adam
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
import tensorflow as tf
import keras.backend as K
from keras.utils import plot_model
tf.random.set_seed(43)


In [21]:
# PER

class ReplayBuffer(object):
    """
    * init the values
    * for DQN actions are discrete
    """
    def __init__(self, max_size, min_size, input_shape, n_actions, discrete=True):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.min_size = min_size
        self.discrete = discrete
        self.index = 0
        
        self.state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float16)
        self.new_state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float16)
        dtype = np.int8 if self.discrete else np.float16
        self.action_memory = np.zeros((self.mem_size, n_actions), dtype=dtype)
        self.reward_memory = np.zeros(self.mem_size, dtype = np.float16)
        self.terminal_memory = np.zeros(self.mem_size)
        self.priorities = np.zeros(self.mem_size, dtype=np.float32)

    def store_transition(self, state, action, reward, state_, done):

        index = self.mem_cntr % self.mem_size                
        self.state_memory[index] = state
        self.new_state_memory[index] = state_

        #* store one hot encoding of actions, if appropriate
        if self.discrete:
            #* Create an zeros-array size of the number of actions
            actions = np.zeros(self.action_memory.shape[1])
            #* Make 1 the value of performed action
            actions[action] = 1.0
            #* Store in action memory
            self.action_memory[index] = actions
        else:
            self.action_memory[index] = action

        #* store reward and if it's terminal info 
        self.reward_memory[index] = reward
        #* we send inverse done info!!!
        self.terminal_memory[index] = 1 - done
        self.priorities[index] = max((self.priorities.max()), 1.0)
        self.mem_cntr +=1
        self.index = self.mem_cntr

    def get_probabilities(self, priority_scale):
        scaled_priorities = np.array(self.priorities) ** priority_scale
        sample_probabilities = scaled_priorities / sum(scaled_priorities)
        return sample_probabilities
        
    def get_importance(self, probabilities):
        importance = 1/(self.mem_cntr) * 1/probabilities
        importance_normalized = importance / max(importance)
        return importance_normalized

    def sample_buffer(self, batch_size, priority_scale=1.0):
        
        if self.mem_cntr >= self.mem_size:
            self.index = self.mem_size
            
        sample_size = batch_size
        sample_probs = self.get_probabilities(priority_scale)
        sample_indices = random.choices(range(self.index), k=sample_size, weights=sample_probs[:self.index])

        states = self.state_memory[sample_indices]
        actions = self.action_memory[sample_indices]
        rewards = self.reward_memory[sample_indices]
        states_ = self.new_state_memory[sample_indices]
        terminal = self.terminal_memory[sample_indices]

        # samples = np.array(self.buffer)[sample_indices]
        importance = self.get_importance(sample_probs[sample_indices])
        return states, actions, rewards, states_, terminal, sample_indices

    def set_priorities(self, indices, errors, offset=0.1):
        for i,e in zip(indices, errors):
            error = abs(e) + offset
            clipped_error = np.minimum(error, 1.0)
            self.priorities[i] = clipped_error

In [22]:
#DDQN agent

class DDQNAgent:

    def __init__(self, alpha, gamma, epsilon, obs_shape,
                 batch_size, epsilon_dec, epsilon_end, mem_size, 
                 min_mem_size, learning_rate, replace_target):

        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        self.epsilon_end = epsilon_end
        self.batch_size = batch_size
        self.mem_size = mem_size
        self.min_mem_size = min_mem_size
        self.replace_target = replace_target
        self.obs_shape = obs_shape
        self.learning_rate = learning_rate

        self.discrete_action_space = np.array([-1.0, -0.75, -0.5, -0.25, 0.0, 0.25, 0.5, 0.75, 1.0])
        self.n_actions = len(self.discrete_action_space)
        self.action_space = [i for i in range(self.n_actions)]

        self.memory = ReplayBuffer(max_size=self.mem_size, min_size=self.min_mem_size,input_shape=self.obs_shape,
                             n_actions=self.n_actions,discrete=True)
                        
        self.q_eval = self._make_model()
        self.q_target = self._make_model()      #we keep a target model which we update every K timesteps
        self.q_eval.summary()
        plot_model(self.q_eval, to_file='./model_ddqn.png')

    def _make_model(self):
        
        model = Sequential()
        model.add( Dense(256, input_dim = self.obs_shape[0], activation='relu') )
        model.add( Dense(256, activation='relu') )
        model.add( Dense( self.n_actions))
        model.compile(loss='mse',optimizer= Adam(learning_rate = self.learning_rate),metrics=["accuracy"]) # type: ignore
 
        return model

    def epsilon_decay(self):
        self.epsilon = self.epsilon*self.epsilon_dec if self.epsilon > self.epsilon_end \
        else self.epsilon_end

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self):
        self.q_target.set_weights(self.q_eval.get_weights())
        
    def get_action(self, observation):

        if np.random.random() > self.epsilon: # type: ignore
    
            # observation = tf.convert_to_tensor(observation, dtype = tf.float16)

            qs_= self.q_eval.predict(observation)
            action_index = np.argmax(qs_)
            action = self.discrete_action_space[action_index]
        else:
            action_index = np.random.randint(0, self.n_actions)
            action = self.discrete_action_space[action_index]
        
        return action, action_index

    def train(self):

        if (self.memory.mem_cntr) < self.min_mem_size:
            return
        #* and ELSE:
        #* sample minibatch and get states vs..
        state, action, reward, new_state, done, sample_indices = \
                            self.memory.sample_buffer(self.batch_size)

        action_values = np.array(self.action_space, dtype=np.int8)
        action_indices = np.dot(action, action_values)

        # state = tf.convert_to_tensor(state, dtype = tf.float16)
        # new_state = tf.convert_to_tensor(new_state, dtype = tf.float16)
        # reward = tf.convert_to_tensor(reward, dtype = tf.float16)
        # done = tf.convert_to_tensor(done)
        # action_indices = tf.convert_to_tensor(action_indices, dtype=np.int8)
        
        #* get the q values of current states by main network
        q_pred = self.q_eval.predict(state)

        #! for abs error
        target_old = np.array(q_pred)

        #* get the q values of next states by target network
        q_next = self.q_target.predict(new_state) #! target_val

        #* get the q values of next states by main network
        q_eval = self.q_eval.predict(new_state) #! target_next

        #* get the actions with highest q values
        max_actions = np.argmax(q_eval, axis=1)

        #* we will update this dont worry
        q_target = q_pred

        batch_index = np.arange(self.batch_size, dtype=np.int32)

        #* new_q = reward + DISCOUNT * max_future_q
        q_target[batch_index, action_indices] = reward + \
                    self.gamma*q_next[batch_index, max_actions.astype(int)]*done

        #* error
        error = target_old[batch_index, action_indices]-q_target[batch_index, action_indices]
        self.memory.set_priorities(sample_indices, error)

        #* now we fit the main model (q_eval)
        _ = self.q_eval.fit(state, q_target, verbose='auto')

        #* If counter reaches set value, update target network with weights of main network
        #* it will update it at the very beginning also
        if self.memory.mem_cntr & self.replace_target == 0:
            self.update_network_parameters()
            print("Target Updated")

        gc.collect()
        K.clear_session()
        self.epsilon_decay()

    def save_model(self, episode):
        print("-----saving models------")
        self.q_eval.save_weights(f"weights/ddqn/q_net-{episode}.h5")
        # self.q_target.save_weights(self.network.checkpoint_file)

    def load_model(self):
        print("-----loading models------")
        self.q_eval.load_weights("q_net.h5")
        self.update_network_parameters()

In [24]:
# envirenment config

env = gym.make('racetrack-v0', render_mode='rgb_array')
env.configure({
    'action': {'lateral': True,
            'longitudinal': False,
            'target_speeds': [0, 5],
            'type': 'ContinuousAction'},
    "observation": {
        "type": "Kinematics",
        "vehicles_count": 1,
        "features": ["presence", "x", "y", "vx", "vy", "cos_h", "sin_h",
                     "heading", "long_off", "lat_off", "ang_off"],
    },
    'show_trajectories': True,
    "other_vehicles": 1

})

In [25]:
# prints env configs
#* obs is flattened to 1D array for nn

pprint.pprint(env.config)
(obs, info), done = env.reset(), False
obs = np.array(obs.flatten())
print("Environment is setted up.")

{'action': {'lateral': True,
            'longitudinal': False,
            'target_speeds': [0, 5],
            'type': 'ContinuousAction'},
 'action_reward': -0.3,
 'centering_position': [0.5, 0.5],
 'collision_reward': -1,
 'controlled_vehicles': 1,
 'duration': 300,
 'lane_centering_cost': 4,
 'lane_centering_reward': 1,
 'manual_control': False,
 'observation': {'features': ['presence',
                              'x',
                              'y',
                              'vx',
                              'vy',
                              'cos_h',
                              'sin_h',
                              'heading',
                              'long_off',
                              'lat_off',
                              'ang_off'],
                 'type': 'Kinematics',
                 'vehicles_count': 1},
 'offscreen_rendering': False,
 'other_vehicles': 1,
 'other_vehicles_type': 'highway_env.vehicle.behavior.IDMVehicle',
 'policy_frequency': 

  logger.warn(f"{pre} is not within the observation space.")


In [26]:
# agent config

agent = DDQNAgent(alpha=0.001, gamma=0.9, epsilon=1.0, obs_shape=obs.shape,
                  batch_size=64, epsilon_dec=0.999, epsilon_end=0.05, mem_size=20000,
                  min_mem_size=100, replace_target=100, learning_rate=0.001)

print("Agent is initialized.")


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               5888      
                                                                 
 dense_1 (Dense)             (None, 256)               65792     
                                                                 
 dense_2 (Dense)             (None, 9)                 2313      
                                                                 
Total params: 73993 (289.04 KB)
Trainable params: 73993 (289.04 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.
Agent is initialized.


2023-10-17 16:06:50.906594: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-17 16:06:50.937298: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-17 16:06:50.937390: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [27]:
#* basic params for training

best_score = -1000.0
score_history = []


In [28]:
# main loop

#TODO-1: Parameters Tuning

#TODO-6: Actions Shaping

#TODO-7: Pruning

#TODO-8: Tensorboard

env = record_videos(env)

for episode in trange(1000, desc='Test episodes'):
        (observation, info), done = env.reset(), False
        observation = np.array(observation.flatten())


        done_ = False
        score = 0
        step = 0
        truncated = False
        # env.render()
        while not done_:
            action, action_index = agent.get_action(observation.reshape((1,observation.shape[0])))
            new_observation, reward, done, truncated, info = env.step(action=[action])
            new_observation = np.array(new_observation.flatten())

            if info["crashed"] == True or info["rewards"]["on_road_reward"] == False or truncated == True:
                done_ = True
                reward = -10.0
            else: done_ = False

            score += reward

            agent.remember(state=observation, action=action_index, done=done_,
                            reward=reward, new_state=new_observation)
            agent.train()

            observation = new_observation
            

        score_history.append(score)
        avg_score = np.mean(score_history[-100:])

        if avg_score > best_score:
            best_score = avg_score
            agent.save_model(episode)

        print('episode ', episode, 'score %.1f' % score,
               'avg score %.1f' % avg_score)
        print("Exp- value:", agent.epsilon)
        time.sleep(1)

env.close()
# show_videos()

  logger.warn(


Test episodes:   0%|          | 0/1000 [00:00<?, ?it/s]

  logger.warn(f"{pre} is not within the observation space.")


-----saving models------
episode  0 score -6.4 avg score -6.4
Exp- value: 1.0
episode  1 score -6.6 avg score -6.5
Exp- value: 1.0
episode  2 score -6.8 avg score -6.6
Exp- value: 1.0
episode  3 score -8.9 avg score -7.2
Exp- value: 1.0
-----saving models------
episode  4 score -1.4 avg score -6.0
Exp- value: 1.0
episode  5 score -8.4 avg score -6.4
Exp- value: 1.0
episode  6 score -8.2 avg score -6.7
Exp- value: 1.0
episode  7 score -5.9 avg score -6.6
Exp- value: 1.0
episode  8 score -5.9 avg score -6.5
Exp- value: 1.0
episode  9 score -7.2 avg score -6.6
Exp- value: 1.0
-----saving models------
episode  10 score 2.4 avg score -5.8
Exp- value: 1.0
episode  11 score -7.6 avg score -5.9
Exp- value: 1.0


2023-10-17 16:07:13.104185: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-10-17 16:07:13.781957: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7feffba60af0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-10-17 16:07:13.781970: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2023-10-17 16:07:13.786794: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-10-17 16:07:14.680783: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8902
2023-10-17 16:07:14.742582: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


episode  12 score -7.1 avg score -6.0
Exp- value: 0.996005996001
episode  13 score -4.2 avg score -5.9
Exp- value: 0.9851045463620021
episode  14 score -6.7 avg score -5.9
Exp- value: 0.9792086759647052
Target Updated
Target Updated
Target Updated
Target Updated
Target Updated
Target Updated
Target Updated
Target Updated
Target Updated
Target Updated
Target Updated
Target Updated
Target Updated
Target Updated
-----saving models------
episode  15 score 6.7 avg score -5.1
Exp- value: 0.9474065091141411
Target Updated
Target Updated
episode  16 score -6.1 avg score -5.2
Exp- value: 0.9407945259609451
episode  17 score -7.6 avg score -5.3
Exp- value: 0.9351638519212846
episode  18 score -5.2 avg score -5.3
Exp- value: 0.925854183751895
episode  19 score -4.8 avg score -5.3
Exp- value: 0.9157205572498949
episode  20 score -8.1 avg score -5.4
Exp- value: 0.9120631656822724
episode  21 score -5.7 avg score -5.4
Exp- value: 0.9047921471137096
episode  22 score -8.9 avg score -5.6
Exp- value: 0

ValueError: could not broadcast input array from shape (11,) into shape (22,)

In [14]:
h = np.save("score_history", np.array(score_history))

In [19]:
print(best_score)
print(avg_score)

27.02133133098047
1.974266679505516


In [13]:
#Evaluating
agent.load_model()

env = record_videos(env)

for episode in trange(10, desc='Test episodes'):
        (observation, info), done = env.reset(), False
        observation = np.array(observation.flatten())

        done_ = False
        score = 0
        step = 0
        # env.render()
        while True:
            action, action_index = agent.get_action(observation.reshape((1,observation.shape[0])))
            new_observation, reward, done, truncated, info = env.step(action=[action])
            new_observation = np.array(new_observation.flatten())

            if info["crashed"] == True or info["rewards"]["on_road_reward"] == False:
                done_ = True
                reward = -1.0
            else: done_ = False

            score += reward


            observation = new_observation

            if done or done_:
                break


        print("Episode: ", episode)
        print("Score: ", score)

env.close()

-----loading models------


  logger.warn(


Test episodes:   0%|          | 0/10 [00:00<?, ?it/s]

Episode:  0
Score:  1.3721311968406065
Episode:  1
Score:  5.0530271586894075
Episode:  2
Score:  6.577469628791425
Episode:  3
Score:  -0.05288229273136713
Episode:  4
Score:  0.775702045658262
Episode:  5
Score:  4.964922426753592
Episode:  6
Score:  9.828439108016884
Episode:  7
Score:  5.319425406038121
Episode:  8
Score:  2.6945582932489436
Episode:  9
Score:  1.8040322219631442
Moviepy - Building video /home/o/Documents/thesis/highway/videos/rl-video-episode-0.mp4.
Moviepy - Writing video /home/o/Documents/thesis/highway/videos/rl-video-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /home/o/Documents/thesis/highway/videos/rl-video-episode-0.mp4
Moviepy - Building video /home/o/Documents/thesis/highway/videos/rl-video-episode-0.mp4.
Moviepy - Writing video /home/o/Documents/thesis/highway/videos/rl-video-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /home/o/Documents/thesis/highway/videos/rl-video-episode-0.mp4
Moviepy - Building video /home/o/Documents/thesis/highway/videos/rl-video-episode-0.mp4.
Moviepy - Writing video /home/o/Documents/thesis/highway/videos/rl-video-episode-0.mp4



                                                  

Moviepy - Done !
Moviepy - video ready /home/o/Documents/thesis/highway/videos/rl-video-episode-0.mp4
Moviepy - Building video /home/o/Documents/thesis/highway/videos/rl-video-episode-0.mp4.
Moviepy - Writing video /home/o/Documents/thesis/highway/videos/rl-video-episode-0.mp4



                                                   

Moviepy - Done !
Moviepy - video ready /home/o/Documents/thesis/highway/videos/rl-video-episode-0.mp4
