In [1]:
import shutil
import os
import pathlib
import numpy as np
import pandas as pd
import plotly.express as px
import kaggle
import datetime
import tensorflow as tf
import xml.etree.ElementTree as ET

from functools import partial



2025-01-26 18:04:04.526288: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-26 18:04:04.535577: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737936244.546467   67109 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737936244.549795   67109 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-26 18:04:04.561101: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

# Implementation Outline
- Do pretraining for image classification to learn features
- Convert model to object detection
- Make custom loss function
- Get object detection dataset
- Train according to paper specifications
- Test model

## Pretraining

For pretraining I will use a smaller version of the ImageNet dataset than the one used in the paper.

In [2]:
## Load pretraining data
kaggle.api.authenticate()
# kaggle.api.dataset_download_files('ifigotin/imagenetmini-1000',
#                                 path='/home/misha/Desktop/data/yolo_paper/pretrain_data/image_data/',
#                                 unzip=True)



### Data Preprocessing

In [3]:
PRETRAIN_DATA_PATH = '/home/misha/Desktop/data/yolo_paper/pretrain_data/'
labels_txt = PRETRAIN_DATA_PATH + 'words.txt'

In [4]:
labels = pd.read_csv(labels_txt, sep='\t')

In [5]:
labels

Unnamed: 0,code,object
0,n00001740,entity
1,n00001930,physical entity
2,n00002137,"abstraction, abstract entity"
3,n00002452,thing
4,n00002684,"object, physical object"
...,...,...
82110,n15299225,study hall
82111,n15299367,"Transfiguration, Transfiguration Day, August 6"
82112,n15299585,usance
82113,n15299783,window


In [6]:
# labels[labels['code'] == 'n03485794214']['object'].values[0]

In [7]:
n_classes = len(os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/train/'))

In [8]:
n_classes

999

In [9]:
len(os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/val/'))

999

In [10]:
# os.listdir returns entries in arbitrary order, so compare the class folders as sets
# rather than as position-sensitive lists.
set(os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/val/')) == set(os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/train/'))

True

In [11]:
os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/')

['val', 'train']

#### Rename the folders to their label instead of the code

In [12]:
# for split in os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/'):
#     for folder in os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/' + split):
#         try:
#             label = labels[labels['code'] == folder]['object'].values[0]
#         except IndexError as err:
#             print(f"No label found for Split: {split} Folder: {folder}")
#             continue

#         ## Rename folder
#         source_path = PRETRAIN_DATA_PATH + f'image_data/imagenet-mini/{split}/{folder}'
#         destination_path = PRETRAIN_DATA_PATH + f'image_data/imagenet-mini/{split}/{label}'

#         try:
#             shutil.move(source_path, destination_path)
#         except OSError as err:
#             print(f"Error moving folder({folder}): {err}")

In [13]:
# Re-check that both splits expose the same class folders; sets are used because
# os.listdir gives no ordering guarantee.
set(os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/val/')) == set(os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/train/'))

True

#### Load data into TensorFlow datasets.

In [14]:
train_dir = pathlib.Path(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/train/')
val_dir = pathlib.Path(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/val/')

In [15]:
# Both splits are loaded with exactly the same options; only the source directory differs.
loader_options = dict(
    labels='inferred',
    color_mode='rgb',
    batch_size=32,
    label_mode='categorical',   ## Vector Representation (Use categorical_crossentropy loss)
    image_size=(448, 448),
    crop_to_aspect_ratio=True,
    seed=1,
    shuffle=True,
)

train_df = tf.keras.utils.image_dataset_from_directory(train_dir, **loader_options)
val_df = tf.keras.utils.image_dataset_from_directory(val_dir, **loader_options)

Found 34745 files belonging to 999 classes.


I0000 00:00:1737936251.736491   67109 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9678 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:01:00.0, compute capability: 8.6


Found 3923 files belonging to 999 classes.


#### Data augmentation

In [16]:
# data_augmentation = tf.keras.Sequential([
#     tf.keras.layers.RandomFlip('horizontal', seed=1),
#     tf.keras.layers.RandomRotation(0.2, seed=1)
# ])

In [17]:
# train_aug = train_df.map(lambda x, y: (data_augmentation(x), y))
# val_aug = val_df.map(lambda x, y: (data_augmentation(x), y))

In [18]:
# train_df = train_df.concatenate(train_aug).prefetch(1)
# val_df = val_df.concatenate(val_aug).prefetch(1)

In [19]:
# len(train_df) * 32

In [20]:
# len(val_df) * 32

### Pretraining

In [21]:
## Partial layers
# Conv2DLayer = partial(tf.keras.layers.Conv2D, strides=(1,1), padding='same',
#                       kernel_initializer='he_normal')
# MaxPoolLayer = partial(tf.keras.layers.MaxPool2D, pool_size=(2,2), strides=2, padding='same',)

In [22]:
# pretraining_model = tf.keras.Sequential()

# # Input layer (only for pretraining, will be removed when applied to full model)
# pretraining_model.add(tf.keras.layers.InputLayer(shape=(224,224,3)))

# ## First 20 layers according to paper
# pretraining_model.add(Conv2DLayer(filters=64, strides=(2,2), kernel_size=(7,7)))
# pretraining_model.add(MaxPoolLayer())
# pretraining_model.add(tf.keras.layers.BatchNormalization())

# pretraining_model.add(Conv2DLayer(filters=192, kernel_size=(3,3)))
# pretraining_model.add(MaxPoolLayer())
# pretraining_model.add(tf.keras.layers.BatchNormalization())

# pretraining_model.add(Conv2DLayer(filters=128, kernel_size=(1,1)))
# pretraining_model.add(Conv2DLayer(filters=256, kernel_size=(3,3)))
# pretraining_model.add(Conv2DLayer(filters=256, kernel_size=(1,1)))
# pretraining_model.add(Conv2DLayer(filters=512, kernel_size=(3,3)))
# pretraining_model.add(MaxPoolLayer())
# pretraining_model.add(tf.keras.layers.BatchNormalization())

# for _ in range(4):
#     pretraining_model.add(Conv2DLayer(filters=256, kernel_size=(1,1)))
#     pretraining_model.add(Conv2DLayer(filters=512, kernel_size=(3,3)))
#     pretraining_model.add(tf.keras.layers.BatchNormalization())
# pretraining_model.add(Conv2DLayer(filters=512, kernel_size=(1,1)))
# pretraining_model.add(Conv2DLayer(filters=1024, kernel_size=(3,3)))
# pretraining_model.add(MaxPoolLayer())
# pretraining_model.add(tf.keras.layers.BatchNormalization())

# ## Flatten Layer
# pretraining_model.add(tf.keras.layers.Flatten())
# pretraining_model.add(tf.keras.layers.BatchNormalization())

# ## Output layer for pretraining (will be removed when layers are reused)
# pretraining_model.add(tf.keras.layers.Dense(n_classes, activation='softmax'))

In [23]:
# pretraining_model.summary()

In [24]:
## Loaded model from checkpoint
# pretraining_model = tf.keras.models.load_model('best_pretrain_model.keras')

In [25]:
## Pretraining
# optimizer = tf.keras.optimizers.SGD(learning_rate=1e-6, momentum=0.9, nesterov=True)
# pretraining_model.compile(optimizer=optimizer,
#                           loss=tf.keras.losses.categorical_crossentropy,
#                           metrics=['accuracy'])

In [26]:
# checkpoints = tf.keras.callbacks.ModelCheckpoint('best_pretrain_model.keras', monitor='val_accuracy',
#                                                  verbose=1, save_best_only=True)
# tb_dir = 'logs/yolo/pretrain/' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tb_dir)

# pretraining_model.fit(train_df, validation_data=val_df, epochs=25, 
#                      callbacks=[checkpoints, tensorboard_callback])

## Using transfer learning for pretraining

In order to save time I will just do transfer learning for this pretraining since it shouldn't affect the actual model a lot.

We will use EfficientNetV2M

### Model fine tune

In [16]:
# Input preprocessing function that ships with the EfficientNetV2 application models.
preprocess_func = tf.keras.applications.efficientnet_v2.preprocess_input


def _preprocess_batch(X, y):
    """Run `preprocess_func` on the images and pass the labels through untouched."""
    return preprocess_func(X), y


# NOTE: train_df / val_df are rebound here, so re-running this cell stacks another map.
train_df = train_df.map(_preprocess_batch).prefetch(2)
val_df = val_df.map(_preprocess_batch).prefetch(2)

In [17]:
# ImageNet-weighted EfficientNetV2M backbone, built without its classification top.
base_pretrain_model = tf.keras.applications.efficientnet_v2.EfficientNetV2M(
    include_top=False,
    weights='imagenet',
    input_shape=(448, 448, 3),
)

# New head: global average pooling, then a softmax layer with one unit per class folder.
avg = tf.keras.layers.GlobalAveragePooling2D()(base_pretrain_model.output)
output = tf.keras.layers.Dense(n_classes, activation='softmax')(avg)

model = tf.keras.Model(inputs=base_pretrain_model.input, outputs=output)

In [18]:
# Freeze every backbone layer so the first training stage only updates the new Dense head.
for layer in base_pretrain_model.layers:
    layer.trainable = False

In [None]:
# Stage 1: train for 4 epochs with AdamW (learning rate 0.001), validating on val_df.
optimizer = tf.keras.optimizers.AdamW(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
history = model.fit(train_df, validation_data=val_df, epochs=4)

Epoch 1/4


I0000 00:00:1737936277.747100   67240 service.cc:148] XLA service 0x725d80014c40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1737936277.747126   67240 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2025-01-26 18:04:38.597756: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1737936280.987938   67240 cuda_dnn.cc:529] Loaded cuDNN version 90300







I0000 00:00:1737936309.129673   67240 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1085/1086[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 282ms/step - accuracy: 0.6471 - loss: 3.5178  









[1m1086/1086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 307ms/step - accuracy: 0.6472 - loss: 3.5161







[1m1086/1086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m433s[0m 356ms/step - accuracy: 0.6474 - loss: 3.5145 - val_accuracy: 0.8348 - val_loss: 0.6955
Epoch 2/4
[1m1086/1086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m339s[0m 312ms/step - accuracy: 0.9309 - loss: 0.2652 - val_accuracy: 0.8358 - val_loss: 0.6918
Epoch 3/4
[1m1086/1086[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m341s[0m 314ms/step - accuracy: 0.9440 - loss: 0.1906 - val_accuracy: 0.8318 - val_loss: 0.7256
Epoch 4/4
[1m 668/1086[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m1:57[0m 281ms/step - accuracy: 0.9549 - loss: 0.1490

In [None]:
pd.DataFrame(history.history).plot()

In [None]:
len(base_pretrain_model.layers)

In [None]:
## Unfreeze some layers and retrain
# Fine-tuning targets the layers closest to the output. The previous `[:75]` slice selected
# the first (input-side) 75 layers instead, leaving everything between them and the new
# head frozen.
for layer in base_pretrain_model.layers[-75:]:
    # BatchNormalization layers stay frozen so their moving statistics are not disturbed
    # while the surrounding weights are being fine-tuned.
    if not isinstance(layer, tf.keras.layers.BatchNormalization):
        layer.trainable = True

In [None]:
# Stage 2 (fine-tuning): the unfrozen pretrained layers get a much smaller step size than
# the 1e-3 used for the fresh head, so large updates do not wipe out pretrained features.
optimizer = tf.keras.optimizers.AdamW(learning_rate=1e-5)
# Recompiling is required for the changed `trainable` flags to take effect.
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Stop once val_accuracy has not improved for 2 epochs and restore the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', restore_best_weights=True, patience=2)

history = model.fit(train_df, validation_data=val_df, epochs=10, callbacks=[early_stop])

In [None]:
model.save('pretrain_model.keras', overwrite=True)

In [None]:
pretrain_model = tf.keras.models.load_model('pretrain_model.keras')

## YOLO Model

### Outline for building
- Get the dataset (Pascal VOC, or whichever dataset the paper specifies)
- Build loss function
- Figure out inference
- Build Model class
- Train

### Download and Clean Dataset

In [38]:
# kaggle.api.dataset_download_files('gopalbhattrai/pascal-voc-2012-dataset',
#                                  path='/home/misha/Desktop/data/yolo_paper/train_data',
#                                  unzip=True)

In [39]:
TRAIN_DATAPATH = '/home/misha/Desktop/data/yolo_paper/train_data/'

In [40]:
def get_center_coords(bbox):
    """Return the midpoint of a corner-format bounding box.

    Args:
        bbox: Sequence of four numbers ordered ``(xmin, ymin, xmax, ymax)``.

    Returns:
        Tuple ``(center_x, center_y)``.
    """
    x_start, y_start, x_end, y_end = bbox
    return (x_start + x_end) / 2, (y_start + y_end) / 2

In [89]:
# Create the label map: each class name maps to its position in the tuple below.
label_map = {
    class_name: class_id
    for class_id, class_name in enumerate((
        "aeroplane",
        "bicycle",
        "bird",
        "boat",
        "bottle",
        "bus",
        "car",
        "cat",
        "chair",
        "cow",
        "diningtable",
        "dog",
        "horse",
        "motorbike",
        "person",
        "pottedplant",
        "sheep",
        "sofa",
        "train",
        "tvmonitor",
    ))
}

In [147]:
def _as_text_path(path):
    """Return ``path`` as ``str``, decoding bytes such as those passed by ``from_generator(args=...)``."""
    if not isinstance(path, (str, bytes)) and hasattr(path, "item"):
        path = path.item()  # unwrap a 0-d NumPy array into a plain Python object
    return os.fsdecode(path)


def _parse_voc_annotation(annotation_path):
    """Parse one annotation XML file into center-format boxes and numeric labels."""
    bboxes = []
    labels = []
    for obj in ET.parse(annotation_path).getroot().findall('object'):
        bbox = obj.find('bndbox')
        # Parse as float: a coordinate written with a decimal point (e.g. "45.7") would
        # make int() raise and cause the whole image to be dropped.
        xmin, ymin, xmax, ymax = (
            float(bbox.find(tag).text) for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        )

        center_x, center_y = get_center_coords([xmin, ymin, xmax, ymax])
        bboxes.append([center_x, center_y, xmax - xmin, ymax - ymin])

        ## Convert label to numeric value
        labels.append(label_map[obj.find('name').text.strip().lower()])
    return bboxes, labels


def pascal_voc_generator(image_dir, annotation_dir, image_set_file):
    """Yield ``(image, bboxes, labels)`` for every image id listed in an image-set file.

    Args:
        image_dir: Directory holding ``<image_id>.jpg`` files (``str`` or ``bytes``).
        annotation_dir: Directory holding ``<image_id>.xml`` files (``str`` or ``bytes``).
        image_set_file: Text file with one image id per line (``str`` or ``bytes``).

    Yields:
        image: Array produced by ``img_to_array`` for the loaded image.
        bboxes: ``float32`` array of shape ``(n_objects, 4)`` with rows
            ``(center_x, center_y, width, height)`` in the annotation's units.
        labels: ``int32`` array of shape ``(n_objects,)`` with ``label_map`` ids.

    Ids whose image or annotation file is missing, or whose files cannot be read or
    parsed, are reported with ``print`` and skipped.
    """
    # Decode bytes arguments properly; the previous str() + quote-stripping approach
    # corrupted any path that contained an apostrophe.
    image_dir = _as_text_path(image_dir)
    annotation_dir = _as_text_path(annotation_dir)

    with open(_as_text_path(image_set_file), 'r') as f:
        ## Get list of image ids for the split (blank lines are ignored)
        image_ids = [line.strip() for line in f if line.strip()]

    for image_id in image_ids:
        ## For each id, get corresponding image and annotation file
        image_path = os.path.normpath(os.path.join(image_dir, f"{image_id}.jpg"))
        annotation_path = os.path.normpath(os.path.join(annotation_dir, f"{image_id}.xml"))

        if not os.path.exists(image_path):
            print(f"Image file not found: {image_path}")
            continue

        if not os.path.exists(annotation_path):
            print(f"Annotation file not found: {annotation_path}")
            continue

        try:
            ## Load the image
            image = tf.keras.preprocessing.image.load_img(image_path)
            image = tf.keras.preprocessing.image.img_to_array(image)

            ## Parse the XML file
            bboxes, labels = _parse_voc_annotation(annotation_path)
        except Exception as err:
            # Best effort: report the broken sample and move on to the next id.
            print(f'''Error ocurred:
                Image id: {image_id}
                Error Message: {err}
            ''')
            continue

        # The yield sits outside the try block so that errors raised by the consumer are
        # not swallowed here. reshape(-1, 4) keeps rank 2 even when there are no objects.
        yield (
            image,
            np.asarray(bboxes, dtype=np.float32).reshape(-1, 4),
            np.asarray(labels, dtype=np.int32),
        )


In [148]:
def _voc_dataset(image_dir, annotation_dir, image_set_file):
    """Wrap `pascal_voc_generator` for one split in a `tf.data.Dataset`."""
    return tf.data.Dataset.from_generator(
        pascal_voc_generator,
        args=(image_dir, annotation_dir, image_set_file),
        output_signature=(
            tf.TensorSpec(shape=(None, None, 3), dtype=tf.float32),  # Image Shape
            tf.TensorSpec(shape=(None, 4), dtype=tf.float32),  # Bboxes shape
            tf.TensorSpec(shape=(None,), dtype=tf.int32),  # Label shape
        ),
    )


## Train DF
image_dir = '/home/misha/Desktop/data/yolo_paper/train_data/VOC2012_train_val/VOC2012_train_val/JPEGImages'
annotation_dir = '/home/misha/Desktop/data/yolo_paper/train_data/VOC2012_train_val/VOC2012_train_val/Annotations'
image_set_file = '/home/misha/Desktop/data/yolo_paper/train_data/VOC2012_train_val/VOC2012_train_val/ImageSets/Main/train.txt'
train_df = _voc_dataset(image_dir, annotation_dir, image_set_file)

## Val DF (same image/annotation folders, different image-set file)
image_dir = '/home/misha/Desktop/data/yolo_paper/train_data/VOC2012_train_val/VOC2012_train_val/JPEGImages'
annotation_dir = '/home/misha/Desktop/data/yolo_paper/train_data/VOC2012_train_val/VOC2012_train_val/Annotations'
image_set_file = '/home/misha/Desktop/data/yolo_paper/train_data/VOC2012_train_val/VOC2012_train_val/ImageSets/Main/val.txt'
val_df = _voc_dataset(image_dir, annotation_dir, image_set_file)

## Test DF
image_dir = '/home/misha/Desktop/data/yolo_paper/train_data/VOC2012_test/VOC2012_test/JPEGImages'
annotation_dir = '/home/misha/Desktop/data/yolo_paper/train_data/VOC2012_test/VOC2012_test/Annotations'
image_set_file = '/home/misha/Desktop/data/yolo_paper/train_data/VOC2012_test/VOC2012_test/ImageSets/Main/test.txt'
test_df = _voc_dataset(image_dir, annotation_dir, image_set_file)

In [149]:
# Add a prefetch buffer of one element to each split.
train_df = train_df.prefetch(1)
val_df = val_df.prefetch(1)
test_df = test_df.prefetch(1)

In [150]:
def visualize_tf_dataset(dataset, num_instances=1):
    """
    Visualize instances from a TensorFlow dataset with bounding boxes.

    Every visualized element gets its own Plotly figure showing the image, one red
    rectangle per bounding box and the class name (looked up through the global
    ``label_map``) annotated above the box.

    Args:
        dataset (tf.data.Dataset): Iterable of ``(image, bboxes, labels)`` tensors, where
            each bbox is ``(center_x, center_y, width, height)``.
        num_instances (int, optional): Number of instances to visualize. Defaults to 1.
            Values <= 0 draw nothing and leave the dataset untouched.
    """
    # Previously the limit was only checked after drawing, so 0 still produced a figure.
    if num_instances <= 0:
        return

    # Create class names list based on the global label_map (index == numeric label)
    class_names = [name for name, _ in sorted(label_map.items(), key=lambda item: item[1])]

    # `shown` counts the instances drawn so far, starting at 1
    for shown, (image, bboxes, labels) in enumerate(dataset, start=1):
        # Convert to numpy
        image_np = image.numpy()
        bboxes_np = bboxes.numpy()
        labels_np = labels.numpy()

        # Create a Plotly figure for this instance
        fig = px.imshow(image_np)
        fig.update_layout(
            showlegend=False,
            margin=dict(l=0, r=0, t=0, b=0),
            xaxis=dict(showticklabels=False, showgrid=False),
            yaxis=dict(showticklabels=False, showgrid=False)
        )

        # Add bounding boxes
        for bbox, label in zip(bboxes_np, labels_np):
            # Unpack bbox (center_x, center_y, width, height)
            center_x, center_y, width, height = bbox

            # Calculate corner coordinates
            xmin = center_x - width / 2
            ymin = center_y - height / 2
            xmax = center_x + width / 2
            ymax = center_y + height / 2

            # Add rectangle shape for bounding box
            fig.add_shape(
                type="rect",
                x0=xmin,
                y0=ymin,
                x1=xmax,
                y1=ymax,
                line=dict(color="red", width=2),
                fillcolor="rgba(255,0,0,0.1)"  # Slight red fill for visibility
            )

            # Add label annotation
            fig.add_annotation(
                x=center_x,
                y=ymin,  # Place text just above the bounding box
                text=class_names[label],
                showarrow=True,
                arrowhead=1,
                ax=0,
                ay=-40,  # Move the text up
                font=dict(color="red", size=12),
                bordercolor="white",
                borderwidth=2,
                borderpad=4,
                bgcolor="white"
            )

        # Show the figure
        fig.show()

        # Stop as soon as the requested number of instances has been drawn
        if shown >= num_instances:
            break

In [154]:
visualize_tf_dataset(test_df, num_instances=3)

Annotation file not found: /home/misha/Desktop/data/yolo_paper/train_data/VOC2012_test/VOC2012_test/Annotations/2008_000001.xml
Annotation file not found: /home/misha/Desktop/data/yolo_paper/train_data/VOC2012_test/VOC2012_test/Annotations/2008_000004.xml
Annotation file not found: /home/misha/Desktop/data/yolo_paper/train_data/VOC2012_test/VOC2012_test/Annotations/2008_000005.xml
Annotation file not found: /home/misha/Desktop/data/yolo_paper/train_data/VOC2012_test/VOC2012_test/Annotations/2008_000006.xml
Annotation file not found: /home/misha/Desktop/data/yolo_paper/train_data/VOC2012_test/VOC2012_test/Annotations/2008_000010.xml
Annotation file not found: /home/misha/Desktop/data/yolo_paper/train_data/VOC2012_test/VOC2012_test/Annotations/2008_000011.xml
Annotation file not found: /home/misha/Desktop/data/yolo_paper/train_data/VOC2012_test/VOC2012_test/Annotations/2008_000012.xml
Annotation file not found: /home/misha/Desktop/data/yolo_paper/train_data/VOC2012_test/VOC2012_test/Anno

### Build Loss Function

In [1]:
class YOLOLoss(tf.keras.losses.Loss):
    

SyntaxError: incomplete input (394256117.py, line 2)