In [1]:
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TerminateOnNaN, CSVLogger
from keras import backend as K
from keras.models import load_model
from math import ceil
import numpy as np
import matplotlib.pyplot as plt

from models.keras_ssd7 import build_model
from keras_loss_function.keras_ssd_loss import SSDLoss
from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes
from keras_layers.keras_layer_DecodeDetections import DecodeDetections
from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast

from ssd_encoder_decoder.ssd_input_encoder import SSDInputEncoder
from ssd_encoder_decoder.ssd_output_decoder import decode_detections, decode_detections_fast

from data_generator.object_detection_2d_data_generator import DataGenerator
from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms
from data_generator.data_augmentation_chain_variable_input_size import DataAugmentationVariableInputSize
from data_generator.data_augmentation_chain_constant_input_size import DataAugmentationConstantInputSize
from data_generator.data_augmentation_chain_original_ssd import SSDDataAugmentation

%matplotlib inline

2025-08-07 22:11:15.639124: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754579475.652350    9446 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754579475.656265    9446 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754579475.667528    9446 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754579475.667552    9446 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754579475.667553    9446 computation_placer.cc:177] computation placer alr

In [2]:
# Height, widths, channels of input images
img_height = 300
img_width = 480
img_channels = 3

# This settings transform the input pixel values to the interval [-1,1]
intensity_mean = 127.5
intensity_range = 127.5

# Number of classes
n_classes = 5

# Explitcit list of anchor box scales -> override "min_scale" and "max_scale" arguments
scales = [0.08, 0.16, 0.32, 0.64, 0.96]

aspect_ratios = [0.5, 1.0, 2.0]
two_boxes_for_ar1 = True
steps = None
offsets = None
clip_boxes = False
variances = [1.0, 1.0, 1.0, 1.0]
normalize_coords = True


In [3]:
model = build_model(image_size=(img_height, img_width, img_channels),
                    n_classes=n_classes,
                    mode='training',
                    l2_regularization=0.0005,
                    scales=scales,
                    aspect_ratios_global=aspect_ratios,
                    aspect_ratios_per_layer=None,
                    two_boxes_for_ar1=two_boxes_for_ar1,
                    steps=steps,
                    offsets=offsets,
                    clip_boxes=clip_boxes,
                    variances=variances,
                    normalize_coords=normalize_coords,
                    subtract_mean=intensity_mean,
                    divide_by_stddev=intensity_range)

adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)

model.compile(optimizer=adam, loss=ssd_loss.compute_loss)

I0000 00:00:1754579478.130456    9446 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2156 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [4]:
# train_dataset = DataGenerator(load_images_into_memory=False, hdf5_dataset_path=None)
# val_dataset = DataGenerator(load_images_into_memory=False, hdf5_dataset_path=None)

# images_dir = 'udacity_driving_datasets/'

# train_labels_filename = 'udacity_driving_datasets/labels_train.csv'
# val_labels_filename = 'udacity_driving_datasets/labels_val.csv'


# train_dataset.parse_csv(images_dir=images_dir,
#                         labels_filename=train_labels_filename,
#                         input_format=['image_name', 'xmin', 'xmax', 'ymin', 'ymax', 'class_id'], # This is the order of the first six columns in the CSV file that contains the labels for your dataset. If your labels are in XML format, maybe the XML parser will be helpful, check the documentation.
#                         include_classes='all')
                    
# val_dataset.parse_csv(images_dir=images_dir,
#                       labels_filename=val_labels_filename,
#                       input_format=['image_name', 'xmin', 'xmax', 'ymin', 'ymax', 'class_id'],
#                       include_classes='all')

# train_dataset.create_hdf5_dataset(file_path='dataset_traffic_train.h5',
#                                   resize=False,
#                                   variable_image_size=True,
#                                   verbose=True)

# val_dataset.create_hdf5_dataset(file_path='dataset_traffic_val.h5',
#                                 resize=False,
#                                 variable_image_size=True,
#                                 verbose=True)

train_dataset = DataGenerator(load_images_into_memory=False, hdf5_dataset_path='dataset_traffic_train.h5')
val_dataset = DataGenerator(load_images_into_memory=False, hdf5_dataset_path='dataset_traffic_val.h5')

print(f"Number of images in the training datasets: {train_dataset.get_dataset_size()}")
print(f"Number of images in the validation datasets: {val_dataset.get_dataset_size()}")

Loading labels: 100%|██████████| 18000/18000 [00:01<00:00, 12478.97it/s]
Loading image IDs: 100%|██████████| 18000/18000 [00:00<00:00, 20088.95it/s]
Loading labels: 100%|██████████| 4241/4241 [00:00<00:00, 11668.70it/s]
Loading image IDs: 100%|██████████| 4241/4241 [00:00<00:00, 20692.43it/s]
Number of images in the training datasets: 18000
Number of images in the validation datasets: 4241


In [5]:
dir(train_dataset)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'create_hdf5_dataset',
 'dataset_indices',
 'dataset_size',
 'eval_neutral',
 'filenames',
 'generate',
 'get_dataset',
 'get_dataset_size',
 'hdf5_dataset',
 'hdf5_dataset_path',
 'image_ids',
 'images',
 'labels',
 'labels_format',
 'labels_output_format',
 'load_hdf5_dataset',
 'load_images_into_memory',
 'parse_csv',
 'parse_json',
 'parse_xml',
 'save_dataset']

In [6]:
vars(train_dataset)

{'labels_output_format': ('class_id', 'xmin', 'ymin', 'xmax', 'ymax'),
 'labels_format': {'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4},
 'dataset_size': 18000,
 'load_images_into_memory': False,
 'images': None,
 'filenames': None,
 'labels': [array([[  1, 237, 143, 251, 155],
         [  3, 437, 120, 454, 186]], dtype=int32),
  array([[  1, 218, 146, 231, 158]], dtype=int32),
  array([[  1, 179, 144, 191, 155],
         [  1, 206, 145, 220, 156],
         [  1, 385, 122, 420, 152],
         [  1, 411, 124, 462, 148],
         [  2, 171, 141, 182, 154]], dtype=int32),
  array([[  1, 171, 144, 182, 157],
         [  1, 200, 145, 214, 157],
         [  1, 362, 118, 430, 149],
         [  1, 433, 124, 479, 148],
         [  2, 165, 140, 177, 154]], dtype=int32),
  array([[  1, 165, 144, 177, 156],
         [  1, 194, 144, 207, 156],
         [  1, 373, 118, 457, 152],
         [  2, 160, 140, 173, 153]], dtype=int32),
  array([[  1, 160, 144, 172, 155],
         [  1, 189, 1

In [None]:
batch_size = 8

# Define the image processing chain
data_augmentation_chain = DataAugmentationConstantInputSize(random_brightness=(-48, 48, 0.5),
                                                            random_contrast=(0.5, 1.8, 0.5),
                                                            random_saturation=(0.5, 1.8, 0.5),
                                                            random_hue=(18, 0.5),
                                                            random_flip=0.5,
                                                            random_translate=((0.03,0.5), (0.03,0.5), 0.5),
                                                            random_scale=(0.5, 2.0, 0.5),
                                                            n_trials_max=3,
                                                            clip_boxes=True,
                                                            overlap_criterion='area',
                                                            bounds_box_filter=(0.3, 1.0),
                                                            bounds_validator=(0.5, 1.0),
                                                            n_boxes_min=1,
                                                            background=(0,0,0))

In [8]:
# Instantiate an encoder that can encode ground truth labels into the format needed by the SSD loss function
# The encoder constructor needs the spatial dimensions of the model's predictor layers
predictor_sizes = [model.get_layer('classes4').output.shape[1:3],
                   model.get_layer('classes5').output.shape[1:3],
                   model.get_layer('classes6').output.shape[1:3],
                   model.get_layer('classes7').output.shape[1:3]]

ssd_input_encoder = SSDInputEncoder(img_height=img_height,
                                    img_width=img_width,
                                    n_classes=n_classes,
                                    predictor_sizes=predictor_sizes,
                                    scales=scales,
                                    aspect_ratios_global=aspect_ratios,
                                    two_boxes_for_ar1=two_boxes_for_ar1,
                                    steps=steps,
                                    offsets=offsets,
                                    clip_boxes=clip_boxes,
                                    variances=variances,
                                    matching_type='multi',
                                    pos_iou_threshold=0.5,
                                    neg_iou_limit=0.3,
                                    normalize_coords=normalize_coords)

In [9]:
# Create the generator handles that will be passed to Keras "fit()" function
train_generator = train_dataset.generate(batch_size=batch_size,
                                         shuffle=True,
                                         transformations=[data_augmentation_chain],
                                         label_encoder=ssd_input_encoder,
                                         returns={'processed_images',
                                                  'encoded_labels'},
                                         keep_images_without_gt=False)

val_generator = val_dataset.generate(batch_size=batch_size,
                                     shuffle=False,
                                     transformations=[],
                                     label_encoder=ssd_input_encoder,
                                     returns={'processed_images',
                                              'encoded_labels'},
                                     keep_images_without_gt=False)

In [10]:
# Define model callbacks
# Set the file path to save the weights
model_checkpoint = ModelCheckpoint(filepath='ssd7_epoch-{epoch:02}_loss-{loss:.4f}_val_loss-{val_loss:.4f}.h5',
                                   monitor='val_loss',
                                   verbose=1,
                                   save_best_only=True,
                                   save_weights_only=False,
                                   mode='auto',
                                   save_freq='epoch')

csv_logger = CSVLogger(filename='ssd7_training_log.csv',
                       append=True)

early_stopping = EarlyStopping(monitor='val_loss',
                                min_delta=0,
                                patience=10,
                                verbose=1)

terminate_on_nan = TerminateOnNaN()

reduce_learning_rate = ReduceLROnPlateau(monitor='val_loss',
                                         patience=8,
                                         factor=0.2,
                                         verbose=1,
                                         epsilon=0.001,
                                         cooldown=0,
                                         min_lr=0.00001)

callbacks = [model_checkpoint,
            csv_logger,
            early_stopping,
            terminate_on_nan,
            reduce_learning_rate]


In [None]:
initial_epoch = 0
final_epoch = 50
steps_per_epoch = 2000
vald_dataset_size = val_dataset.get_dataset_size()

history = model.fit(train_generator,
                    steps_per_epoch=steps_per_epoch,
                    epochs=final_epoch,
                    callbacks=callbacks,
                    validation_data=val_generator,
                    validation_steps=ceil(vald_dataset_size/batch_size),
                    initial_epoch=initial_epoch, verbose=1)

Epoch 1/50


I0000 00:00:1754579486.721037   10564 service.cc:152] XLA service 0x71c08c0255a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1754579486.721055   10564 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Laptop GPU, Compute Capability 8.6
2025-08-07 22:11:26.887625: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1754579489.032557   10564 cuda_dnn.cc:529] Loaded cuDNN version 90300
2025-08-07 22:11:30.482854: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Constant folding an instruction is taking > 1s:

  %add.1 = f32[16,300,480,3]{3,2,1,0} add(f32[16,300,480,3]{3,2,1,0} %constant.72, f32[16,300,480,3]{3,2,1,0} %broadcast.30), metadata={op_type="Sub" op_name="functional_1/input_mean_normalization_1/sub" source_file="/home/nguyen-van-anh/anaconda3/envs/tf/lib/python3.11/site-packages

[1m   1/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m11:52:22[0m 43s/step - loss: 26.1815

I0000 00:00:1754579524.872049   10564 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2025-08-07 22:12:11.850371: E external/local_xla/xla/service/slow_operation_alarm.cc:73] Constant folding an instruction is taking > 4s:

  %add.1 = f32[16,300,480,3]{3,2,1,0} add(f32[16,300,480,3]{3,2,1,0} %constant.72, f32[16,300,480,3]{3,2,1,0} %broadcast.30), metadata={op_type="Sub" op_name="functional_1/input_mean_normalization_1/sub" source_file="/home/nguyen-van-anh/anaconda3/envs/tf/lib/python3.11/site-packages/tensorflow/python/framework/ops.py" source_line=1200}

This isn't necessarily a bug; constant-folding is inherently a trade-off between compilation time and speed at runtime. XLA has some guards that attempt to keep constant folding from taking too long, but fundamentally you'll always be able to come up with an input program that takes a long time.

If you'd like to file a bug, run with envvar XLA_FLAGS=--xla_dump_to

[1m  23/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:13:59[0m 19s/step - loss: 17.4445