In [45]:
import shutil
import os
import pathlib
import numpy as np
import pandas as pd
import plotly.express as px
import kaggle
import datetime
import tensorflow as tf
import xml.etree.ElementTree as ET

from functools import partial

# Implmentation Outline
- Do pretraining for image classification to learn features
- Convert model to object detection
- Make custom loss function
- Get object detection dataset
- Train according to paper specifications
- Test model

## Pretraining

For pretraining I will use a smaller version of the ImageNet dataset then the one used in the paper.

In [2]:
## Load pretraining data
kaggle.api.authenticate()
# kaggle.api.dataset_download_files('ifigotin/imagenetmini-1000',
#                                 path='/home/misha/Desktop/data/yolo_paper/pretrain_data/image_data/',
#                                 unzip=True)



### Data Preprocessing

In [3]:
PRETRAIN_DATA_PATH = '/home/misha/Desktop/data/yolo_paper/pretrain_data/'
labels_txt = PRETRAIN_DATA_PATH + 'words.txt'

In [4]:
labels = pd.read_csv(labels_txt, sep='\t')

In [5]:
labels

Unnamed: 0,code,object
0,n00001740,entity
1,n00001930,physical entity
2,n00002137,"abstraction, abstract entity"
3,n00002452,thing
4,n00002684,"object, physical object"
...,...,...
82110,n15299225,study hall
82111,n15299367,"Transfiguration, Transfiguration Day, August 6"
82112,n15299585,usance
82113,n15299783,window


In [6]:
# labels[labels['code'] == 'n03485794214']['object'].values[0]

In [7]:
n_classes = len(os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/train/'))

In [8]:
n_classes

999

In [9]:
len(os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/val/'))

999

In [10]:
os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/val/') == os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/train/')

True

In [11]:
os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/')

['val', 'train']

#### Rename the folders to their label instead of the code

In [12]:
# for split in os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/'):
#     for folder in os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/' + split):
#         try:
#             label = labels[labels['code'] == folder]['object'].values[0]
#         except IndexError as err:
#             print(f"No label found for Split: {split} Folder: {folder}")
#             continue

#         ## Rename folder
#         source_path = PRETRAIN_DATA_PATH + f'image_data/imagenet-mini/{split}/{folder}'
#         destination_path = PRETRAIN_DATA_PATH + f'image_data/imagenet-mini/{split}/{label}'

#         try:
#             shutil.move(source_path, destination_path)
#         except OSError as err:
#             print(f"Error moving folder({folder}): {err}")

In [13]:
os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/val/') == os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/train/')

True

#### Move load data into Tensorflow datasets.

In [14]:
train_dir = pathlib.Path(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/train/')
val_dir = pathlib.Path(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/val/')

In [15]:
train_df = tf.keras.utils.image_dataset_from_directory(
    train_dir, 
    labels='inferred',
    color_mode='rgb',
    batch_size=32, 
    label_mode='categorical',   ## Vector Representation (Use categorical_crossentropy loss)
    image_size=(224, 224),
    crop_to_aspect_ratio=True,
    seed=1, 
    shuffle=True
)
val_df = tf.keras.utils.image_dataset_from_directory(
    val_dir, 
    labels='inferred',
    color_mode='rgb',
    batch_size=32, 
    label_mode='categorical',   ## Vector Representation (Use categorical_crossentropy loss)
    image_size=(224, 224),
    crop_to_aspect_ratio=True,
    seed=1, 
    shuffle=True
)

Found 34745 files belonging to 999 classes.


I0000 00:00:1737821254.121944    4468 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9617 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:01:00.0, compute capability: 8.6


Found 3923 files belonging to 999 classes.


#### Data augmentation

In [16]:
# data_augmentation = tf.keras.Sequential([
#     tf.keras.layers.RandomFlip('horizontal', seed=1),
#     tf.keras.layers.RandomRotation(0.2, seed=1)
# ])

In [17]:
# train_aug = train_df.map(lambda x, y: (data_augmentation(x), y))
# val_aug = val_df.map(lambda x, y: (data_augmentation(x), y))

In [18]:
# train_df = train_df.concatenate(train_aug).prefetch(1)
# val_df = val_df.concatenate(val_aug).prefetch(1)

In [19]:
# len(train_df) * 32

In [20]:
# len(val_df) * 32

### Pretraining

In [21]:
## Partial layers
# Conv2DLayer = partial(tf.keras.layers.Conv2D, strides=(1,1), padding='same',
#                       kernel_initializer='he_normal')
# MaxPoolLayer = partial(tf.keras.layers.MaxPool2D, pool_size=(2,2), strides=2, padding='same',)

In [22]:
# pretraining_model = tf.keras.Sequential()

# # Input layer (only for pretraining, will be removed when applied to full model)
# pretraining_model.add(tf.keras.layers.InputLayer(shape=(224,224,3)))

# ## First 20 layers according to paper
# pretraining_model.add(Conv2DLayer(filters=64, strides=(2,2), kernel_size=(7,7)))
# pretraining_model.add(MaxPoolLayer())
# pretraining_model.add(tf.keras.layers.BatchNormalization())

# pretraining_model.add(Conv2DLayer(filters=192, kernel_size=(3,3)))
# pretraining_model.add(MaxPoolLayer())
# pretraining_model.add(tf.keras.layers.BatchNormalization())

# pretraining_model.add(Conv2DLayer(filters=128, kernel_size=(1,1)))
# pretraining_model.add(Conv2DLayer(filters=256, kernel_size=(3,3)))
# pretraining_model.add(Conv2DLayer(filters=256, kernel_size=(1,1)))
# pretraining_model.add(Conv2DLayer(filters=512, kernel_size=(3,3)))
# pretraining_model.add(MaxPoolLayer())
# pretraining_model.add(tf.keras.layers.BatchNormalization())

# for _ in range(4):
#     pretraining_model.add(Conv2DLayer(filters=256, kernel_size=(1,1)))
#     pretraining_model.add(Conv2DLayer(filters=512, kernel_size=(3,3)))
#     pretraining_model.add(tf.keras.layers.BatchNormalization())
# pretraining_model.add(Conv2DLayer(filters=512, kernel_size=(1,1)))
# pretraining_model.add(Conv2DLayer(filters=1024, kernel_size=(3,3)))
# pretraining_model.add(MaxPoolLayer())
# pretraining_model.add(tf.keras.layers.BatchNormalization())

# ## Flatten Layer
# pretraining_model.add(tf.keras.layers.Flatten())
# pretraining_model.add(tf.keras.layers.BatchNormalization())

# ## Output layer for pretraining (will be removed when layers are reused)
# pretraining_model.add(tf.keras.layers.Dense(n_classes, activation='softmax'))

In [23]:
# pretraining_model.summary()

In [24]:
## Loaded model from checkpoint
# pretraining_model = tf.keras.models.load_model('best_pretrain_model.keras')

In [25]:
## Pretraining
# optimizer = tf.keras.optimizers.SGD(learning_rate=1e-6, momentum=0.9, nesterov=True)
# pretraining_model.compile(optimizer=optimizer,
#                           loss=tf.keras.losses.categorical_crossentropy,
#                           metrics=['accuracy'])

In [26]:
# checkpoints = tf.keras.callbacks.ModelCheckpoint('best_pretrain_model.keras', monitor='val_accuracy',
#                                                  verbose=1, save_best_only=True)
# tb_dir = 'logs/yolo/pretrain/' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=tb_dir)

# pretraining_model.fit(train_df, validation_data=val_df, epochs=25, 
#                      callbacks=[checkpoints, tensorboard_callback])

## Using transfer learning for pretraining

In order to save time I will just do transfer learning for this pretraining since it shouldn't affect the actual model a lot.

We will use EfficientNetV2M

### Model fine tune

In [27]:
preprocess_func = tf.keras.applications.efficientnet_v2.preprocess_input

train_df = train_df.map(lambda X, y: (preprocess_func(X), y)).prefetch(2)
val_df = val_df.map(lambda X, y: (preprocess_func(X), y)).prefetch(2)

In [28]:
base_pretrain_model = tf.keras.applications.efficientnet_v2.EfficientNetV2M(weights='imagenet', 
                                                                           include_top=False,
                                                                           input_shape=(224, 224, 3))
avg = tf.keras.layers.GlobalAveragePooling2D()(base_pretrain_model.output)
output = tf.keras.layers.Dense(n_classes, activation='softmax')(avg)
model = tf.keras.Model(inputs=base_pretrain_model.input, outputs=output)

In [29]:
for layer in base_pretrain_model.layers:
    layer.trainable = False

In [30]:
# optimizer = tf.keras.optimizers.AdamW(learning_rate=0.001)
# model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
# history = model.fit(train_df, validation_data=val_df, epochs=4)

In [31]:
# pd.DataFrame(history.history).plot()

In [32]:
len(base_pretrain_model.layers)

740

In [33]:
## Unfreeze some layers and retrain
for layer in base_pretrain_model.layers[:75]:
    layer.trainable = True

In [35]:
# optimizer = tf.keras.optimizers.AdamW(learning_rate=1e-5)
# model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', restore_best_weights=True, patience=2)

# history = model.fit(train_df, validation_data=val_df, epochs=10, callbacks=[early_stop])

In [36]:
model = tf.keras.models.load_model('pretrain_model.keras')

In [38]:
model.save('pretrain_model.keras', overwrite=True)

## YOLO Model

### Outline for building
- Get Dataset Pascal VOC or whatever is said in paper
- Build loss function
- Figure out infrence
- Build Model class
- Train

### Download and Clean Dataset

In [40]:
kaggle.api.dataset_download_files('gopalbhattrai/pascal-voc-2012-dataset',
                                 path='/home/misha/Desktop/data/yolo_paper/train_data',
                                 unzip=True)

Dataset URL: https://www.kaggle.com/datasets/gopalbhattrai/pascal-voc-2012-dataset


In [41]:
TRAIN_DATAPATH = '/home/misha/Desktop/data/yolo_paper/train_data/'

In [46]:
def get_center_coords(bbox):
    xmin, ymin, xmax, ymax = bbox
    center_x = (xmin + xmax) / 2
    center_y = (ymin + ymax) / 2
    return center_x, center_y

In [1]:
# def load_pascal_voc_data(image_dir, annotation_dir, image_set_file):
    images = []
    bboxes = []
    labels = []

    with open(image_set_file, 'r') as f:
        ## Get list of image ids for the split
        image_ids = [line.strip() for line in f]

    for image_id in image_ids:
        ## For each id, get corresponding image and annotation file
        image_path = os.path.join(image_dir, f"{image_id}.jpg")
        annotation_path = os.path.join(annotation_dir, f"{image_id}.xml")

        ## Load the image
        image = tf.keras.preprocessing.image.load_img(image_path)
        image = tf.keras.preprocessing.image.img_to_array(image)

        ## Parse the XML file
        tree = ET.parse(annotation_path)
        root = tree.getroot()
        for obj in root.findall('object'):
            label = obj.find('name')
            bbox = obj.find('bndbox')
            xmin = int(bbox.find('xmin').text)
            ymin = int(bbox.find('ymin').text)
            xmax = int(bbox.find('xmax').text)
            ymax = int(bbox.find('ymax').text)

            ## Get actual features
            center_x, center_y = get_center_coords([xmin, ymin, xmax, ymax])
            # Calculate width and height
            width = xmax - xmin
            height = ymax - ymin

            bboxes.append([center_x, center_y, width, height])
            labels.append(label)

        images.append(image)

    return images, bboxes, labels

IndentationError: unexpected indent (1920750134.py, line 2)

In [None]:
image_dir = '/home/misha/Desktop/data/yolo_paper/train_data/VOC2012_train_val/VOC2012_train_val/JPEGImages'
annotation_dir = '/home/misha/Desktop/data/yolo_paper/train_data/VOC2012_train_val/VOC2012_train_val/Annotations'
image_set_file = '/home/misha/Desktop/data/yolo_paper/train_data/VOC2012_train_val/VOC2012_train_val/ImageSets/Main/train.txt'

images, bboxes, labels = load_pascal_voc_data(image_dir, annotation_dir, image_set_file)

train_df = tf.data.Dataset.from_tensor_slices((images, bboxes, labels))

### Build Loss Function

In [1]:
class YOLOLoss(tf.keras.losses.Loss):
    

SyntaxError: incomplete input (394256117.py, line 2)