In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, Model
import random
from collections import deque
import os
import tensorflow.summary as tf_summary
import time
import datetime
from sklearn.model_selection import train_test_split
import pickle
import os
import h5py
from PIL import Image
import numpy as np
import datetime
import math
import random
import yaml
import argparse
import transforms3d
from tensorflow.keras.models import Model
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import BatchNormalization, LeakyReLU, Add, Flatten, Dense, concatenate, Rescaling, Normalization, Conv2D, MaxPooling2D, Dropout
from tensorflow.keras.initializers import HeNormal
from tensorflow.keras import Model, Input, regularizers
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from sklearn.model_selection import train_test_split


2025-01-22 00:56:07.977833: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-22 00:56:07.996068: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-22 00:56:08.001470: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-22 00:56:08.015436: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class DatasetLoader:
    """Read robot-navigation transitions from a TFRecord file.

    The first record of the file is treated as a metadata record whose
    ``length`` field holds the number of samples; every following record is
    parsed as one transition
    ``(laser, goal, motion_command, reward, next_laser, next_goal)``.
    """

    def __init__(self, tfrecord_file, image_shape=[224, 224, 3], lasers_shape=513, goal_shape=2, motion_command_shape=3, reward_shape = 1):
        """Store the feature sizes and read the dataset length from the file.

        :param tfrecord_file: path of the TFRecord file to read.
        :param image_shape: stored on the instance; not used by the parsers in this class.
        :param lasers_shape: number of floats in the ``laser`` / ``next_laser`` features.
        :param goal_shape: number of floats in the ``goal`` / ``next_goal`` features.
        :param motion_command_shape: number of floats in the ``motion_command`` feature.
        :param reward_shape: number of floats in the ``reward`` feature.
        """
        self.tfrecord_file = tfrecord_file
        self.image_shape = image_shape
        self.lasers_shape = lasers_shape
        self.goal_shape = goal_shape
        self.motion_command_shape = motion_command_shape
        self.reward_shape = reward_shape
        self.dataset_length = self._get_dataset_length()
        print(self.dataset_length)

    def _parse_with_lasers_function(self, proto):
        """Decode one serialized example into a 6-tuple of float32 tensors.

        :return: ``(laser, goal, motion_command, reward, next_laser, next_goal)``
        """
        features = {
            'laser': tf.io.FixedLenFeature([self.lasers_shape], tf.float32),
            'goal': tf.io.FixedLenFeature([self.goal_shape], tf.float32),
            'next_laser': tf.io.FixedLenFeature([self.lasers_shape], tf.float32),
            'next_goal': tf.io.FixedLenFeature([self.goal_shape], tf.float32),
            'motion_command': tf.io.FixedLenFeature([self.motion_command_shape], tf.float32),
            'reward': tf.io.FixedLenFeature([self.reward_shape], tf.float32),
        }
        parsed_features = tf.io.parse_single_example(proto, features)
        return (parsed_features['laser'], parsed_features['goal'], parsed_features['motion_command'], parsed_features['reward'],
                parsed_features['next_laser'], parsed_features['next_goal'])

    def _get_dataset_length(self):
        """Return the sample count stored in the first record, or ``None``.

        ``None`` is returned when the file is empty or when the first record
        carries no ``length`` field (the parser then yields the ``-1`` default).
        """
        dataset = tf.data.TFRecordDataset(self.tfrecord_file)
        metadata_features = {
            'metadata': tf.io.FixedLenFeature([], tf.string, default_value=''),
            'length': tf.io.FixedLenFeature([], tf.int64, default_value=-1)
        }

        for record in dataset.take(1):
            parsed_features = tf.io.parse_single_example(record, metadata_features)
            dataset_length = parsed_features['length'].numpy()
            # -1 is the default filled in when the record has no 'length'
            # field; it must not leak out as if it were a real sample count.
            if dataset_length >= 0:
                return dataset_length

        print("Metadata not found. Setting length to None.")
        return None

    def load_dataset(self):
        """Return the parsed transitions as a ``tf.data.Dataset``.

        The first record is always skipped because it is assumed to be the
        metadata record.
        NOTE(review): if a file has no metadata record this drops one real
        sample — confirm all input files carry the header.
        """
        dataset = tf.data.TFRecordDataset(self.tfrecord_file)
        dataset = dataset.skip(1).map(self._parse_with_lasers_function, num_parallel_calls=tf.data.AUTOTUNE)
        return dataset

    def split_dataset(self, dataset, train_size=0.7, val_size=0.2):
        """Split ``dataset`` into contiguous train / validation / test parts.

        :param dataset: dataset providing ``take`` and ``skip``.
        :param train_size: fraction of ``dataset_length`` used for training.
        :param val_size: fraction of ``dataset_length`` used for validation.
        :return: ``(train_dataset, val_dataset, test_dataset)``; the test part
            is whatever remains after the first two.
        :raises ValueError: if the dataset length is unknown or negative.
        """
        num_elements = self.dataset_length
        # A negative count would turn the take() calls below into
        # "take everything", silently producing overlapping splits.
        if num_elements is None or num_elements < 0:
            raise ValueError("Dataset length is not set. Ensure the metadata is properly included in the TFRecord.")

        train_end = int(train_size * num_elements)
        val_end = int((train_size + val_size) * num_elements)

        train_dataset = dataset.take(train_end)
        val_dataset = dataset.skip(train_end).take(val_end - train_end)
        test_dataset = dataset.skip(val_end).take(num_elements - val_end)

        return train_dataset, val_dataset, test_dataset

    def get_prepared_datasets(self, train_size=0.7, val_size=0.2):
        """Load the file and return its ``(train, val, test)`` split."""
        dataset = self.load_dataset()
        return self.split_dataset(dataset, train_size, val_size)


In [31]:
# Define the Actor Network (Policy)
class Actor(tf.keras.Model):
    """Policy network: a thin wrapper around a pre-trained Keras model on disk."""

    def __init__(self, model_path):
        """Load the saved model at ``model_path`` without compiling it."""
        super().__init__()
        self.model = tf.keras.models.load_model(model_path, compile=False)

    def call(self, laser, goal):
        """Feed ``[laser, goal]`` to the wrapped model and return its output."""
        policy_inputs = [laser, goal]
        return self.model(policy_inputs)

# Define the Critic Network (Q-value function)
class Critic(tf.keras.Model):
    """Q-value network taking a laser scan, a goal and a motion command.

    Each input goes through its own small dense branch; the three branches are
    concatenated and reduced to a single linear output.
    """

    def __init__(self, laser_shape, goal_shape, motion_command_shape, reward_shape):
        """Store the input sizes and build the underlying functional model.

        :param laser_shape: length of the laser input vector.
        :param goal_shape: length of the goal input vector.
        :param motion_command_shape: length of the motion-command (action) vector.
        :param reward_shape: stored on the instance; the output layer always has one unit.
        """
        super(Critic, self).__init__()
        self.laser_shape = laser_shape
        self.goal_shape = goal_shape
        self.motion_command_shape = motion_command_shape
        self.reward_shape = reward_shape

        # Define normalization layers.
        # NOTE(review): none of these layers is adapt()-ed anywhere in this
        # notebook — confirm whether they are meant to normalize the inputs.
        self.laser_normalization = Normalization()
        self.goal_normalization = Normalization()
        # Despite its name this layer is applied to the motion command input
        # (see _create_model); the attribute name is kept for compatibility.
        self.reward_normalization = Normalization()

        # Initialize the model
        self.model = self._create_model()

    @staticmethod
    def _dense_block(features, units, dropout_rate=None):
        """Apply Dense -> BatchNormalization -> LeakyReLU (-> Dropout) to ``features``."""
        features = Dense(units, kernel_initializer=HeNormal(), kernel_regularizer=regularizers.l2(1e-4))(features)
        features = BatchNormalization()(features)
        features = LeakyReLU()(features)
        if dropout_rate is not None:
            features = Dropout(dropout_rate)(features)
        return features

    def _create_model(self):
        """
        Creates the functional Keras model mapping (laser, goal, motion command) to one scalar.

        :return: tf.keras.Model with inputs ``[laser_input, goal_input, motion_command_input]``
                 and a single linear output named ``reward_output``
        """
        # Define inputs
        laser_input = Input(shape=(self.laser_shape,), name='laser_input')
        goal_input = Input(shape=(self.goal_shape,), name='goal_input')
        motion_command_input = Input(shape=(self.motion_command_shape,), name='motion_command_input')

        # Optional normalization
        laser_normalized = self.laser_normalization(laser_input)
        goal_normalized = self.goal_normalization(goal_input)
        motion_command_normalized = self.reward_normalization(motion_command_input)

        # Layers are created in the same order as before (laser, goal, motion
        # command, head) so auto-generated layer names and weight order match.

        # Laser processing: 64 -> 32
        laser_hidden = self._dense_block(laser_normalized, 64, dropout_rate=0.2)
        laser_hidden = self._dense_block(laser_hidden, 32)

        # Goal processing: 16 -> 32
        goal_hidden = self._dense_block(goal_normalized, 16, dropout_rate=0.2)
        goal_hidden = self._dense_block(goal_hidden, 32)

        # Motion command processing: 8 -> 32
        motion_command_hidden = self._dense_block(motion_command_normalized, 8, dropout_rate=0.2)
        motion_command_hidden = self._dense_block(motion_command_hidden, 32)

        # Concatenate processed features
        concatenated = concatenate([laser_hidden, goal_hidden, motion_command_hidden])

        # Fully connected layers after concatenation: 32 -> 16, both with dropout
        hidden = self._dense_block(concatenated, 32, dropout_rate=0.2)
        hidden = self._dense_block(hidden, 16, dropout_rate=0.2)

        # Output layer for reward
        output = Dense(1, activation='linear', name='reward_output')(hidden)

        # Create and return the model
        return Model(inputs=[laser_input, goal_input, motion_command_input], outputs=output)

    def call(self, laser, goal, action, training=None):
        """Evaluate the critic on a batch.

        :param laser: batch of laser scans.
        :param goal: batch of goals.
        :param action: batch of motion commands.
        :param training: optional Keras training flag forwarded to the inner
            model so Dropout / BatchNormalization can run in training mode.
            ``None`` (the default) keeps the previous behaviour.
        :return: output of the inner model for ``[laser, goal, action]``.
        """
        return self.model([laser, goal, action], training=training)


# DDPG Agent
class DDPGAgent:
    """Critic pre-training agent in a DDPG layout.

    Holds an actor and a critic plus a target copy of each. Only the critic
    is optimised by the code in this class; the actor comes from a
    pre-trained model file.
    """

    def __init__(self, laser_shape, goal_shape, motion_command_shape, reward_shape, pretrained_actor_model_path, gamma=0.8, tau=0.005, lr=1e-3):
        """Build the networks, sync the targets and create the critic optimizer.

        :param laser_shape: length of the laser input vector.
        :param goal_shape: length of the goal input vector.
        :param motion_command_shape: length of the action vector.
        :param reward_shape: forwarded to ``Critic``.
        :param pretrained_actor_model_path: saved Keras model used for both actors.
        :param gamma: discount factor applied to the target Q-value.
        :param tau: interpolation factor used by ``soft_update``.
        :param lr: learning rate of the Adam optimizer for the critic.
        """
        self.actor = Actor(pretrained_actor_model_path)
        self.critic = Critic(laser_shape, goal_shape, motion_command_shape, reward_shape)
        self.target_actor = Actor(pretrained_actor_model_path)
        self.target_critic = Critic(laser_shape, goal_shape, motion_command_shape, reward_shape)

        # Initialize target networks to be the same as the original networks
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

        self.critic_optimizer = optimizers.Adam(lr)

        self.gamma = gamma  # Discount factor for future rewards
        self.tau = tau  # Target network update rate

    def _prepare_transition(self, batch):
        """Unpack a batch and compute its TD target.

        :param batch: 6-tuple ``(laser, goal, action, reward, next_laser, next_goal)``.
        :return: ``(state_laser, state_goal, action, target)`` where
            ``target = reward + gamma * Q_target(next_state, actor_target(next_state))``.
        """
        state_laser, state_goal, action, reward, next_state_laser, next_state_goal = batch
        state_laser = np.array(state_laser)
        state_goal = np.array(state_goal)
        action = np.array(action)
        reward = np.array(reward)
        next_state_laser = np.array(next_state_laser)
        next_state_goal = np.array(next_state_goal)
        # The batch carries no terminal flag, so every transition is treated
        # as non-terminal.
        done = 0

        target_action = self.target_actor(next_state_laser, next_state_goal)
        target_q_value = self.target_critic(next_state_laser, next_state_goal, target_action)
        target = reward + (1 - done) * self.gamma * target_q_value
        return state_laser, state_goal, action, target

    def update(self, train_batch):
        """Run one gradient step on the critic and return the loss.

        :param train_batch: 6-tuple ``(laser, goal, action, reward, next_laser, next_goal)``.
        :return: mean squared TD error of the batch as a NumPy scalar.
        """
        # The target is computed outside the tape: it only involves the
        # target networks and needs no gradient.
        state_laser, state_goal, action, target = self._prepare_transition(train_batch)

        # Update Critic
        # NOTE(review): the critic is called without a training flag, so its
        # Dropout / BatchNormalization layers run in inference mode, and the
        # L2 regularization losses are not added to the loss — confirm this
        # is intended.
        with tf.GradientTape() as tape:
            current_q_value = self.critic(state_laser, state_goal, action)
            critic_loss = tf.reduce_mean(tf.square(current_q_value - target))

        critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))

        return critic_loss.numpy()

    def soft_update(self, source, target):
        """Blend ``source`` weights into ``target``: ``tau * source + (1 - tau) * target``."""
        blended_weights = [
            self.tau * source_w + (1.0 - self.tau) * target_w
            for source_w, target_w in zip(source.get_weights(), target.get_weights())
        ]
        target.set_weights(blended_weights)

    def validate(self, val_dataset, val_steps, batch_size):
        """Return the mean critic loss over up to ``val_steps`` batches.

        :param val_dataset: unbatched dataset of transitions.
        :param val_steps: number of batches to evaluate.
        :param batch_size: number of transitions per batch.
        :return: average mean-squared TD error as a NumPy scalar.
        :raises ValueError: if no batch could be drawn (empty dataset or ``val_steps <= 0``).
        """
        # Build the pipeline once and iterate it. Calling next(iter(ds)) per
        # step would re-create the iterator every time: the file is re-read,
        # skip() replays from the start, the shuffle buffer is refilled and
        # only the head of the split is ever sampled.
        val_batches = (
            val_dataset.shuffle(buffer_size=1000)
            .batch(batch_size)
            .repeat()  # keep supplying batches if val_steps exceeds one pass
            .take(val_steps)
            .prefetch(tf.data.AUTOTUNE)
        )

        val_critic_loss = []
        for step, val_batch in enumerate(val_batches):
            print(step)
            state_laser, state_goal, action, target = self._prepare_transition(val_batch)
            current_q_value = self.critic(state_laser, state_goal, action)
            critic_loss = tf.reduce_mean(tf.square(current_q_value - target))
            val_critic_loss.append(critic_loss)
            print('v-end')

        if not val_critic_loss:
            raise ValueError("No validation batches were produced; check the dataset and val_steps.")

        mean_loss = sum(val_critic_loss) / len(val_critic_loss)
        return mean_loss.numpy()
 



In [32]:
# Training Loop
def train_critic(agent, train_dataset, val_dataset, batch_size, model_save_path, epochs=100, train_steps=1000, val_steps=200):
    """Pre-train the agent's critic and log losses to TensorBoard.

    Relies on the notebook-level globals ``writer`` (summary writer) and
    ``model_name`` (checkpoint file prefix).

    :param agent: object providing ``update``, ``soft_update``, ``validate``,
        ``critic`` and ``target_critic``.
    :param train_dataset: unbatched dataset of training transitions.
    :param val_dataset: unbatched dataset of validation transitions.
    :param batch_size: number of transitions per batch.
    :param model_save_path: directory that receives the checkpoints.
    :param epochs: number of epochs to run.
    :param train_steps: critic updates per epoch.
    :param val_steps: validation batches per epoch.
    """
    # One iterator for the whole run. Calling next(iter(ds)) per step would
    # rebuild the pipeline every time (re-read the file, refill the shuffle
    # buffer) and only ever sample the head of the dataset. repeat() lets the
    # iterator survive more steps than one pass provides.
    train_batches = iter(
        train_dataset.shuffle(buffer_size=10000)
        .batch(batch_size)
        .repeat()
        .prefetch(tf.data.AUTOTUNE)
    )

    for epoch in range(epochs):
        critic_loss = None  # stays None when train_steps == 0
        for step in range(train_steps):
            train_batch = next(train_batches)
            print('train',step)
            critic_loss = agent.update(train_batch)
            if step%50==0:
                agent.soft_update(agent.critic, agent.target_critic)

            # Per-step curves are only written for every fifth epoch.
            if epoch%5==0:
                with writer.as_default():
                    tf_summary.scalar(f"Episode_{epoch} Critic Loss", critic_loss, step=step)
            print('t-end')
        print('epoch complete')
        if epoch%50==0:
            # Use a separate name: overwriting model_save_path would nest the
            # next checkpoint under the previous ".keras" file path.
            checkpoint_path = os.path.join(model_save_path, f"{model_name}_model_episode_{epoch+1}.keras")
            agent.critic.save(checkpoint_path)
            print('model saved')
        print('validating')
        val_critic_loss = agent.validate(val_dataset, val_steps, batch_size)

        # Log metrics
        with writer.as_default():
            if critic_loss is not None:
                tf_summary.scalar("Critic Loss", critic_loss, step=epoch)
            tf_summary.scalar("Validation Critic Loss", val_critic_loss, step=epoch)

def load_config(config_path):
    """Parse the YAML file at ``config_path`` and return its content."""
    with open(config_path, 'r') as stream:
        return yaml.safe_load(stream)

# Load the configuration
# NOTE(review): absolute, user-specific path — consider making it configurable.
config = load_config('/work/nselva2s/rnd/repo/src/Reinforcement_Learning/Critic_Pretraining/config_rl_critic_laser.yaml')

root_dir = config['root_dir']
tf_file = os.path.join(root_dir, config['tfrecord_file'])

# One TensorBoard log directory per run, suffixed with the start time.
log_dir = config['log_dir'] +datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = os.path.join(root_dir, log_dir)
os.makedirs(log_dir, exist_ok=True)

model_save_path = os.path.join(root_dir, config['model_save_path'])
model_name = config['model_name']
os.makedirs(model_save_path, exist_ok=True)

pretrained_actor_model_path = config['pretrained_actor_model_path']
pretrained_actor_model_path = os.path.join(root_dir, pretrained_actor_model_path)

# epochs=config['epochs']
# train_steps = config['train_steps']
# batch_size = config['batch_size']
# val_steps = config['val_steps']
# test_steps = config['test_steps']

# Small hard-coded values that currently override the config entries above.
epochs=10
train_steps = 5
batch_size = 128
val_steps = 10
test_steps = 50

# Feature sizes passed to the agent.
laser_shape = 513
goal_shape = 2
motion_command_shape = 3
reward_shape = 1

loader = DatasetLoader(tf_file)
train_dataset, val_dataset, test_dataset = loader.get_prepared_datasets(train_size = config['train_size'], val_size = config['val_size'])

# train_critic reads `writer` and `model_name` as globals, so both must exist
# before it is called.
writer = tf_summary.create_file_writer(log_dir)

agent = DDPGAgent(laser_shape, goal_shape, motion_command_shape, reward_shape, pretrained_actor_model_path=pretrained_actor_model_path) 
train_critic(agent, train_dataset, val_dataset, batch_size, model_save_path, epochs, train_steps, val_steps)
# validate() requires batch_size; omitting it raises a TypeError.
test_critic_loss = agent.validate(test_dataset, test_steps, batch_size)
with writer.as_default():
    tf_summary.scalar("Test Critic Loss", test_critic_loss, step=epochs)


420699
train 0
t-end
train 1
t-end
train 2
t-end
train 3
t-end
train 4
t-end
epoch complete
model saved
validating


2025-01-21 23:41:50.582191: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:357: Filling up shuffle buffer (this may take a while): 1 of 1000
2025-01-21 23:41:50.712206: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.


KeyboardInterrupt: 

In [47]:
import keras

# Scratch experiment: shuffle once with a fixed order (no reshuffling between
# iterations) so the three splits below stay disjoint, then split
# 10% train / 70% validation / remainder test.
ds = loader.load_dataset().shuffle(buffer_size=10000, reshuffle_each_iteration=False)
tr, v, tes = loader.split_dataset(ds, train_size=0.1, val_size=0.7)

In [49]:
# Scratch probe: draw three batches the same way validate() does, creating a
# fresh iterator for every step.
val_dataset_ = (
    v.shuffle(buffer_size=1000)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
val_critic_loss = []
for step in range(3):
    print(step)
    val_batch = next(iter(val_dataset_))
    print('jkdak')

0
jkdak
1
jkdak
2
jkdak


In [54]:
# Scratch probe: iterate a single pipeline with a plain for-loop and stop
# after six batches.
val_dataset_ = (
    v.shuffle(buffer_size=100000)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
val_critic_loss = []
c = 0
for batch in val_dataset_:
    print(c)
    c += 1
    if c > 5:
        break
    

2025-01-22 00:18:07.604424: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:478: Filling up shuffle buffer (this may take a while): 24574 of 100000
2025-01-22 00:18:17.364178: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.


0
1
2
3
4
5


In [4]:
import ddpg_critic_pretraining_agent 
def load_config(config_path):
    """Parse the YAML file at ``config_path`` and return its content."""
    with open(config_path, 'r') as stream:
        return yaml.safe_load(stream)
    
# Load the configuration
config = load_config('/work/nselva2s/rnd/repo/src/Reinforcement_Learning/Critic_Pretraining/config_rl_critic_laser.yaml')

# Rebuild the dataset splits from the configured TFRecord file.
root_dir = config['root_dir']
tf_file = os.path.join(root_dir, config['tfrecord_file'])
loader = DatasetLoader(tf_file)
train_dataset, val_dataset, test_dataset = loader.get_prepared_datasets(
    train_size=config['train_size'],
    val_size=config['val_size'],
)

# Load a previously saved critic checkpoint for inspection (no compile step).
model = tf.keras.models.load_model(
    '/work/nselva2s/rnd/ReinforcementLearning_models/pretrained_critic_laser_corr07112024_a_model_episode_1.keras',
    compile=False,
)



420699


In [9]:
# Pull a single shuffled batch of 128 test transitions and run the loaded
# critic on it.
test_dataset_ = (
    test_dataset.shuffle(buffer_size=100000)
    .batch(128)
    .prefetch(tf.data.AUTOTUNE)
)
for test_batch in test_dataset_.take(1):
    state_laser, state_goal, action, reward, next_state_laser, next_state_goal = test_batch
    q_pred = model([state_laser, state_goal, action])
    

2025-01-22 01:44:39.922954: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:37: Filling up shuffle buffer (this may take a while): 1 of 100000
2025-01-22 01:44:46.449376: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.


ValueError: Layer "functional" expects 3 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor: shape=(128, 513), dtype=float32, numpy=
array([[1.334, 1.334, 1.335, ..., 2.193, 2.193, 2.196],
       [1.652, 1.648, 1.637, ..., 2.379, 2.363, 2.355],
       [4.525, 4.523, 4.519, ..., 1.234, 1.231, 1.229],
       ...,
       [1.099, 1.09 , 1.089, ..., 2.435, 2.436, 2.435],
       [1.402, 1.402, 1.402, ..., 0.799, 0.797, 0.797],
       [0.595, 0.593, 0.593, ..., 2.787, 2.787, 2.784]], dtype=float32)>]

In [10]:
# Re-run the loaded critic on the batch unpacked in the previous cell.
q_pred = model([state_laser, state_goal, action])

In [11]:
# Display the critic's predictions for the sampled test batch.
q_pred

<tf.Tensor: shape=(128, 1), dtype=float32, numpy=
array([[  5.566354  ],
       [  6.1339426 ],
       [  4.0919085 ],
       [  6.962619  ],
       [ 20.425295  ],
       [ 12.030766  ],
       [ 34.029587  ],
       [  3.683224  ],
       [ 12.870767  ],
       [ 16.84511   ],
       [  3.7487519 ],
       [ 19.663013  ],
       [  8.189832  ],
       [  5.464986  ],
       [ 16.01029   ],
       [ 12.051841  ],
       [ 14.108821  ],
       [  4.1974897 ],
       [ 13.011275  ],
       [  8.291923  ],
       [ 10.486874  ],
       [  8.067769  ],
       [ 11.515433  ],
       [  6.653762  ],
       [  5.9220457 ],
       [  5.9138546 ],
       [ 28.936562  ],
       [ 11.216077  ],
       [ 20.396439  ],
       [ 25.065018  ],
       [  4.876897  ],
       [ 16.006666  ],
       [112.53276   ],
       [ 15.004068  ],
       [ 18.035015  ],
       [130.58313   ],
       [  6.818269  ],
       [ 10.761857  ],
       [ 12.59332   ],
       [ 24.407187  ],
       [ 15.069953  ],
       

In [12]:
# Display the rewards of the same test batch, for comparison with q_pred.
reward

<tf.Tensor: shape=(128, 1), dtype=float32, numpy=
array([[-20.85164  ],
       [ -2.9716349],
       [  4.684072 ],
       [  5.127513 ],
       [  5.8854237],
       [-11.422145 ],
       [  9.16414  ],
       [-22.386084 ],
       [  7.8355236],
       [  8.873653 ],
       [  3.5653758],
       [  9.157463 ],
       [  7.9223695],
       [-13.675498 ],
       [  8.801754 ],
       [  8.186878 ],
       [  3.9918492],
       [  6.7647505],
       [  5.0734725],
       [  6.689303 ],
       [  6.2356534],
       [  6.0028753],
       [ -5.9903064],
       [-12.332437 ],
       [-12.332856 ],
       [  7.667494 ],
       [  8.9350195],
       [  8.143522 ],
       [  8.997749 ],
       [  8.389065 ],
       [  3.9739268],
       [  5.051523 ],
       [119.30976  ],
       [  8.425279 ],
       [  7.1160355],
       [129.44112  ],
       [  4.004947 ],
       [  4.441172 ],
       [  4.3580017],
       [  8.447446 ],
       [  8.488038 ],
       [ -3.9376974],
       [ 99.32358  ],
    