In [45]:
# fundamental modules
import gymnasium as gym
import highway_env
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import gc
import time
import pprint
from tqdm.notebook import trange
import sys

In [46]:
# display visuals 
from utils import record_videos, show_videos

In [47]:
# deep learning modules
from keras.optimizers import Adam
from keras.layers import Dense, Flatten
from keras.models import Sequential
from keras.optimizers import Adam
import tensorflow as tf
import keras.backend as K
from keras.utils import plot_model
tf.random.set_seed(43)
import keras

In [48]:
class ReplayBuffer:
    """Fixed-size circular store of (state, action, reward, next state, done) transitions.

    Slots are written in order; once ``max_size`` transitions have been stored,
    each new transition overwrites the oldest slot.
    """

    def __init__(self, max_size, input_shape, n_actions):
        """Allocate the backing arrays.

        Args:
            max_size: Number of transitions the buffer can hold.
            input_shape: Iterable giving the shape of a single state.
            n_actions: Number of components in a single action.
        """
        self.mem_size = max_size
        self.mem_cnt = 0  # total transitions ever stored (keeps growing past mem_size)
        # float32 instead of float16: half precision has only ~3 significant
        # decimal digits and overflows above 65504, which silently distorts
        # stored states (e.g. 1000.3 is stored as 1000.5).
        self.state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float32)
        self.action_memory = np.zeros((self.mem_size, n_actions), dtype=np.float32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward,
                         new_state, done):
        """Write one transition into the next slot, wrapping around when full."""
        index = self.mem_cnt % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = new_state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done

        self.mem_cnt += 1

    def sample_buffer(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Args:
            batch_size: Number of transitions to draw.

        Returns:
            Tuple ``(states, actions, rewards, new_states, dones)`` of arrays
            whose first axis has length ``batch_size``.

        Raises:
            ValueError: Raised by ``np.random.choice`` when ``batch_size``
                exceeds the number of stored transitions (sampling is done
                without replacement).
        """
        # Only the filled part of the arrays holds real transitions.
        filled = min(self.mem_cnt, self.mem_size)
        batch = np.random.choice(filled, batch_size, replace=False)

        states = self.state_memory[batch]
        states_ = self.new_state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        dones = self.terminal_memory[batch]

        return states, actions, rewards, states_, dones

    # Original (misspelled) method name, kept so existing callers keep working.
    sample_bufer = sample_buffer
        

In [49]:
class CriticNetwork(keras.Model):
    """Critic of the DDPG agent: maps a (state, action) pair to one scalar value."""

    def __init__(self, fc1_dims = 512, fc2_dims = 512,
                 name = 'critic', chkpt_dir = 'tmp/ddpg'):
        """Create the layers.

        Args:
            fc1_dims: Width of the first hidden dense layer.
            fc2_dims: Width of the second hidden dense layer.
            name: Label used to build ``checkpoint_file``.
            chkpt_dir: Directory part of ``checkpoint_file``.
        """
        super(CriticNetwork, self).__init__()

        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.model_name = name
        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir,
                                            self.model_name + '_ddpg.h5')

        # Not used by call(); kept so the model exposes the same attributes as before.
        self.flatten = Flatten()

        # Honour the requested layer widths (they used to be hard-coded to 512,
        # so the constructor arguments had no effect).
        self.fc1 = Dense(units=self.fc1_dims, activation='relu')
        self.fc2 = Dense(units=self.fc2_dims, activation='relu')
        self.q = Dense(1,activation=None)

    def call(self, state, action):
        """Return the value estimate for each (state, action) row.

        Args:
            state: Batch of states; concatenated with ``action`` along axis 1.
            action: Batch of actions with the same leading dimension as ``state``.

        Returns:
            Output of the final single-unit dense layer (one value per row).
        """
        state_value = self.fc1(tf.concat([state, action], axis = 1))
        action_value = self.fc2(state_value)
        q = self.q(action_value)

        return q
    
class ActorNetwork(keras.Model):
    """Actor of the DDPG agent: maps a state to an action with components in [-1, 1]."""

    def __init__(self, n_actions=1, fc1_dims = 512, fc2_dims = 512,
                 name = 'actor', chkpt_dir = 'tmp/ddpg'):
        """Create the layers.

        Args:
            n_actions: Number of action components produced by the output layer.
            fc1_dims: Width of the first hidden dense layer.
            fc2_dims: Width of the second hidden dense layer.
            name: Label used to build ``checkpoint_file``.
            chkpt_dir: Directory part of ``checkpoint_file``.
        """
        super(ActorNetwork, self).__init__()

        self.n_actions = n_actions
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.model_name = name
        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir,
                                            self.model_name + '_ddpg.h5')

        # Honour the requested layer widths (they used to be hard-coded to 512,
        # so the constructor arguments had no effect).
        self.fc1 = Dense(units=self.fc1_dims, activation='relu')
        self.fc2 = Dense(units=self.fc2_dims, activation='relu')
        # tanh bounds every action component to [-1, 1].
        self.mu = Dense(self.n_actions ,activation='tanh')

    def call(self, state):
        """Return the deterministic action for each state in the batch."""
        prob = self.fc1(state)
        prob = self.fc2(prob)

        mu = self.mu(prob)

        return mu

In [50]:
class Agent:
    """DDPG agent: actor and critic networks, their target copies and a replay buffer."""

    def __init__(self, input_dims, alpha = 0.001,
                 beta = 0.002, gamma = .99,
                 n_actions = 1, max_size = 10000,
                 tau = 0.003, batch_size = 64, noise =0.3):
        """Build the networks, the replay buffer and synchronise the targets.

        Args:
            input_dims: Shape of a single (flattened) observation, as an iterable.
            alpha: Learning rate of the actor optimizers.
            beta: Learning rate of the critic optimizers.
            gamma: Discount factor used in the critic target.
            n_actions: Number of action components.
            max_size: Capacity of the replay buffer.
            tau: Default blending factor for the target-network update.
            batch_size: Number of transitions sampled per learning step.
            noise: Standard deviation of the Gaussian exploration noise.
        """
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.noise = noise
        self.max_action = 1.0
        self.min_action = -1.0

        self.actor = ActorNetwork(n_actions = n_actions, name = 'actor')
        self.critic = CriticNetwork(name = 'critic')
        self.target_actor = ActorNetwork(n_actions=n_actions,
                                         name = 'target_actor')
        self.target_critic = CriticNetwork(name = 'target_critic')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic.compile(optimizer=Adam(learning_rate=beta))
        self.target_actor.compile(optimizer=Adam(learning_rate=alpha))
        self.target_critic.compile(optimizer=Adam(learning_rate=beta))

        # Subclassed Keras models only create their variables the first time
        # they are called. Without this forward pass every `.weights` list is
        # still empty here, the hard copy below copies nothing, and the target
        # networks start from their own unrelated random initialisation.
        dummy_state = tf.zeros((1, *input_dims), dtype=tf.float32)
        dummy_action = tf.zeros((1, n_actions), dtype=tf.float32)
        self.actor(dummy_state)
        self.target_actor(dummy_state)
        self.critic(dummy_state, dummy_action)
        self.target_critic(dummy_state, dummy_action)

        # tau=1 makes the targets exact copies of the online networks.
        self.update_network_parameters(tau=1)

    def update_network_parameters(self, tau=None):
        """Blend the online weights into the target networks.

        Each target weight becomes ``tau * online + (1 - tau) * target``.

        Args:
            tau: Blending factor; ``None`` means use ``self.tau``.
        """
        if tau is None:
            tau = self.tau

        for online, target in ((self.actor, self.target_actor),
                               (self.critic, self.target_critic)):
            blended = [weight*tau + target_weight*(1-tau)
                       for weight, target_weight in zip(online.weights, target.weights)]
            target.set_weights(blended)

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state,action,reward,new_state,done)

    def save_model(self, episode):
        """Save the actor and critic weights, tagging the file names with ``episode``."""
        print("-----saving models------")
        # Create the target directory up front so the first save cannot fail
        # just because the directory is missing.
        os.makedirs("weights/ddpg", exist_ok=True)
        self.actor.save_weights(f"weights/ddpg/ddpg_actor-{episode}.h5")
        self.critic.save_weights(f"weights/ddpg/ddpg_critic-{episode}.h5")

    def load_model(self):
        """Placeholder: loading saved weights is not implemented."""
        pass

    def choose_action(self,observation, evaluate=False):
        """Return the actor's action for a single observation.

        Args:
            observation: One observation; it is wrapped into a batch of one.
            evaluate: When False, Gaussian noise with standard deviation
                ``self.noise`` is added for exploration.

        Returns:
            Tensor holding the action for ``observation``, clipped to
            ``[self.min_action, self.max_action]``.
        """
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        actions = self.actor(state)
        if not evaluate:
            actions += tf.random.normal(shape =[self.n_actions],
                                         mean= 0.0, stddev = self.noise) # type: ignore

        # Noise can push the action out of range, so clip after adding it.
        actions = tf.clip_by_value(actions, self.min_action, self.max_action)

        return actions[0] # type: ignore

    def learn(self):
        """Run one critic update, one actor update and one target update.

        Does nothing until the buffer has received at least ``batch_size``
        transitions.
        """
        if self.memory.mem_cnt < self.batch_size:
            return
        state, action, reward, new_state, done = \
        self.memory.sample_bufer(self.batch_size)

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        # Cast the boolean flags in numpy first so the conversion does not
        # depend on TensorFlow's bool-to-float handling.
        dones = tf.convert_to_tensor(done.astype(np.float32))

        # Critic update: regress Q(s, a) onto r + gamma * Q'(s', mu'(s')),
        # dropping the bootstrap term for terminal transitions.
        with tf.GradientTape() as tape:
            target_actions = self.target_actor(states_)
            critic_value_ = tf.squeeze(self.target_critic(
                states_, target_actions),1)

            critic_value = tf.squeeze(self.critic(states,actions),1)
            target = rewards + self.gamma*critic_value_*(1.0-dones)

            critic_loss = keras.losses.MSE(target, critic_value)

        critic_network_gradient = tape.gradient(critic_loss,
                                                self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(zip(
            critic_network_gradient, self.critic.trainable_variables))

        # Actor update: ascend the critic's value of the actor's own actions
        # (implemented as minimising the negated mean value).
        with tf.GradientTape() as tape:
            new_policy_actions = self.actor(states)
            actor_loss = -self.critic(states, new_policy_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)

        actor_network_gradient = tape.gradient(actor_loss,
                                               self.actor.trainable_variables)

        self.actor.optimizer.apply_gradients(zip(
            actor_network_gradient, self.actor.trainable_variables))

        self.update_network_parameters()

        # NOTE(review): running these on every step is expensive; kept as in
        # the original, presumably to limit memory growth — confirm it is needed.
        gc.collect()
        K.clear_session()

In [51]:
# environment config

env = gym.make('racetrack-v0', render_mode='rgb_array')
# Configure the underlying highway-env environment directly. Reaching
# `configure` through the gymnasium wrappers relies on wrapper attribute
# forwarding, which gymnasium deprecates (likely the source of the logger
# warning in this cell's output — confirm).
env.unwrapped.configure({
    'action': {'lateral': True,
            'longitudinal': False,
            'target_speeds': [0, 3],
            'type': 'ContinuousAction'},
    "observation": {
        "type": "Kinematics",
        "vehicles_count": 2,
        "features": ["presence", "x", "y", "vx", "vy", "cos_h", "sin_h",
                     "heading", "long_off", "lat_off", "ang_off"],
    },
    'show_trajectories': True,
    "other_vehicles": 1

})

  logger.warn(


In [52]:
# print the env config, then reset once to obtain a sample observation
#* obs is flattened to a 1D array for the nn

# Read `config` from the underlying environment rather than through the
# gymnasium wrappers, whose attribute forwarding is deprecated.
pprint.pprint(env.unwrapped.config)
(obs, info), done = env.reset(), False
obs = np.array(obs.flatten())
print("Environment is setted up.")

{'action': {'lateral': True,
            'longitudinal': False,
            'target_speeds': [0, 3],
            'type': 'ContinuousAction'},
 'action_reward': -0.3,
 'centering_position': [0.5, 0.5],
 'collision_reward': -1,
 'controlled_vehicles': 1,
 'duration': 300,
 'lane_centering_cost': 4,
 'lane_centering_reward': 1,
 'manual_control': False,
 'observation': {'features': ['presence',
                              'x',
                              'y',
                              'vx',
                              'vy',
                              'cos_h',
                              'sin_h',
                              'heading',
                              'long_off',
                              'lat_off',
                              'ang_off'],
                 'type': 'Kinematics',
                 'vehicles_count': 2},
 'offscreen_rendering': False,
 'other_vehicles': 1,
 'other_vehicles_type': 'highway_env.vehicle.behavior.IDMVehicle',
 'policy_frequency': 

  logger.warn(


In [53]:
# One agent, sized to the flattened observation, with a single action component.
agent = Agent(n_actions=1, input_dims=obs.shape)

In [54]:
#* bookkeeping for the training run

score_history = []     # total score of each finished episode, in order
best_score = -1000.0   # starting value for the best average score

In [55]:
# Train for 2000 episodes; weights are saved whenever the 100-episode running
# average improves on the best value seen so far.
for episode in trange(2000, desc='Test episodes'):
    observation, info = env.reset()
    observation = np.array(observation.flatten())

    score = 0
    episode_over = False

    while not episode_over:
        action = agent.choose_action(observation, evaluate = False)
        act = np.array(action)
        new_observation, reward, done, truncated, info = env.step(act) # type: ignore

        new_observation = np.array(new_observation.flatten())

        # A crash, leaving the road, or truncation ends the episode and
        # replaces the step reward with a fixed penalty.
        failed = bool(info["crashed"]
                      or not info["rewards"]["on_road_reward"]
                      or truncated)
        if failed:
            reward = -10.0

        # Also stop when the environment itself reports termination, so a
        # finished environment is never stepped again.
        episode_over = failed or bool(done)

        score += reward

        # Store the flag that actually ended the episode. Storing only the
        # environment's `done` marked off-road/truncated transitions as
        # non-terminal, so the critic bootstrapped from states that are never
        # continued.
        agent.remember(observation,action,reward,new_observation,episode_over)

        agent.learn()
        observation = new_observation

    score_history.append(score)
    avg_score = np.mean(score_history[-100:])

    if avg_score > best_score:
        best_score = avg_score
        agent.save_model(episode)

    print('episode ', episode, 'score %.1f' % score,
           'avg score %.1f' % avg_score, 'best_score %.1f' % best_score)

env.close()





Test episodes:   0%|          | 0/2000 [00:00<?, ?it/s]

-----saving models------
episode  0 score -3.3 avg score -3.3 best_score -3.3
episode  1 score -8.1 avg score -5.7 best_score -3.3
episode  2 score -4.6 avg score -5.3 best_score -3.3
episode  3 score -7.6 avg score -5.9 best_score -3.3
episode  4 score -6.4 avg score -6.0 best_score -3.3
episode  5 score -6.5 avg score -6.1 best_score -3.3
episode  6 score -8.8 avg score -6.5 best_score -3.3
episode  7 score -5.0 avg score -6.3 best_score -3.3
episode  8 score -8.5 avg score -6.5 best_score -3.3
episode  9 score -8.3 avg score -6.7 best_score -3.3
episode  10 score -6.4 avg score -6.7 best_score -3.3
episode  11 score -8.2 avg score -6.8 best_score -3.3
episode  12 score -9.1 avg score -7.0 best_score -3.3
episode  13 score -8.2 avg score -7.1 best_score -3.3
episode  14 score -9.1 avg score -7.2 best_score -3.3
episode  15 score -8.2 avg score -7.3 best_score -3.3
episode  16 score -8.1 avg score -7.3 best_score -3.3
episode  17 score -9.1 avg score -7.4 best_score -3.3
episode  18 s

KeyboardInterrupt: 