In [1]:
import shutil
import os
import pathlib
import numpy as np
import pandas as pd
import plotly.express as px
import kaggle
import tensorflow as tf

from functools import partial



2025-01-23 14:12:38.051769: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-23 14:12:38.060891: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737663158.071768    8660 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737663158.075124    8660 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-23 14:12:38.086535: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

# Implmentation Outline
- Do pretraining for image classification to learn features
- Convert model to object detection
- Make custom loss function
- Get object detection dataset
- Train according to paper specifications
- Test model

## Pretraining

For pretraining I will use a smaller version of the ImageNet dataset then the one used in the paper.

In [2]:
## Load pretraining data
kaggle.api.authenticate()
# kaggle.api.dataset_download_files('ifigotin/imagenetmini-1000',
#                                 path='/home/misha/Desktop/data/yolo_paper/pretrain_data/image_data/',
#                                 unzip=True)



### Data Preprocessing

In [3]:
PRETRAIN_DATA_PATH = '/home/misha/Desktop/data/yolo_paper/pretrain_data/'
labels_txt = PRETRAIN_DATA_PATH + 'words.txt'

In [4]:
labels = pd.read_csv(labels_txt, sep='\t')

In [5]:
labels

Unnamed: 0,code,object
0,n00001740,entity
1,n00001930,physical entity
2,n00002137,"abstraction, abstract entity"
3,n00002452,thing
4,n00002684,"object, physical object"
...,...,...
82110,n15299225,study hall
82111,n15299367,"Transfiguration, Transfiguration Day, August 6"
82112,n15299585,usance
82113,n15299783,window


In [6]:
# labels[labels['code'] == 'n03485794214']['object'].values[0]

In [7]:
n_classes = len(os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/train/'))

In [8]:
n_classes

999

In [9]:
len(os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/val/'))

999

In [10]:
os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/val/') == os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/train/')

True

In [11]:
os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/')

['val', 'train']

#### Rename the folders to their label instead of the code

In [12]:
# for split in os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/'):
#     for folder in os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/' + split):
#         try:
#             label = labels[labels['code'] == folder]['object'].values[0]
#         except IndexError as err:
#             print(f"No label found for Split: {split} Folder: {folder}")
#             continue

#         ## Rename folder
#         source_path = PRETRAIN_DATA_PATH + f'image_data/imagenet-mini/{split}/{folder}'
#         destination_path = PRETRAIN_DATA_PATH + f'image_data/imagenet-mini/{split}/{label}'

#         try:
#             shutil.move(source_path, destination_path)
#         except OSError as err:
#             print(f"Error moving folder({folder}): {err}")

In [13]:
os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/val/') == os.listdir(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/train/')

True

#### Move load data into Tensorflow datasets.

In [14]:
train_dir = pathlib.Path(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/train/')
val_dir = pathlib.Path(PRETRAIN_DATA_PATH + 'image_data/imagenet-mini/val/')

In [15]:
train_df = tf.keras.utils.image_dataset_from_directory(
    train_dir, 
    labels='inferred',
    color_mode='rgb',
    batch_size=32, 
    label_mode='categorical',   ## Vector Representation (Use categorical_crossentropy loss)
    image_size=(224, 224),
    crop_to_aspect_ratio=True,
    seed=1, 
    shuffle=True
)
val_df = tf.keras.utils.image_dataset_from_directory(
    val_dir, 
    labels='inferred',
    color_mode='rgb',
    batch_size=32, 
    label_mode='categorical',   ## Vector Representation (Use categorical_crossentropy loss)
    image_size=(224, 224),
    crop_to_aspect_ratio=True,
    seed=1, 
    shuffle=True
)

Found 34745 files belonging to 999 classes.


I0000 00:00:1737663160.167807    8660 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9521 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:01:00.0, compute capability: 8.6


Found 3923 files belonging to 999 classes.


#### Data augmentation

In [16]:
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip('horizontal', seed=1),
    tf.keras.layers.RandomRotation(0.2, seed=1)
])

In [17]:
train_aug = train_df.map(lambda x, y: (data_augmentation(x), y))
val_aug = val_df.map(lambda x, y: (data_augmentation(x), y))

In [18]:
train_df = train_df.concatenate(train_aug).prefetch(1)
val_df = val_df.concatenate(val_aug).prefetch(1)

In [19]:
len(train_df) * 32

69504

In [20]:
len(val_df) * 32

7872

### Pretraining

In [21]:
## Partial layers
Conv2DLayer = partial(tf.keras.layers.Conv2D, strides=(1,1), padding='same')
MaxPoolLayer = partial(tf.keras.layers.MaxPool2D, pool_size=(2,2), strides=2, padding='same')

In [22]:
pretraining_model = tf.keras.Sequential()

# Input layer (only for pretraining, will be removed when applied to full model)
pretraining_model.add(tf.keras.layers.InputLayer(shape=(224,224,3)))

## First 20 layers according to paper
pretraining_model.add(Conv2DLayer(filters=64, strides=(2,2), kernel_size=(7,7)))
pretraining_model.add(MaxPoolLayer())

pretraining_model.add(Conv2DLayer(filters=192, kernel_size=(3,3)))
pretraining_model.add(MaxPoolLayer())

pretraining_model.add(Conv2DLayer(filters=128, kernel_size=(1,1)))
pretraining_model.add(Conv2DLayer(filters=256, kernel_size=(3,3)))
pretraining_model.add(Conv2DLayer(filters=256, kernel_size=(1,1)))
pretraining_model.add(Conv2DLayer(filters=512, kernel_size=(3,3)))
pretraining_model.add(MaxPoolLayer())

for _ in range(4):
    pretraining_model.add(Conv2DLayer(filters=256, kernel_size=(1,1)))
    pretraining_model.add(Conv2DLayer(filters=512, kernel_size=(3,3)))
pretraining_model.add(Conv2DLayer(filters=512, kernel_size=(1,1)))
pretraining_model.add(Conv2DLayer(filters=1024, kernel_size=(3,3)))
pretraining_model.add(MaxPoolLayer())

## Flatten Layer
pretraining_model.add(tf.keras.layers.Flatten())

## Output layer for pretraining (will be removed when layers are reused)
pretraining_model.add(tf.keras.layers.Dense(n_classes, activation='softmax', kernel_initializer='he_normal'))

In [23]:
pretraining_model.summary()

In [24]:
## Pretraining
optimizer = tf.keras.optimizers.Nadam(learning_rate=0.01)
pretraining_model.compile(optimizer=optimizer,
                          loss=tf.keras.losses.categorical_crossentropy,
                          metrics=['accuracy'])

In [25]:
checkpoints = tf.keras.callbacks.ModelCheckpoint('best_pretrain_model.keras', monitor='val_accuracy',
                                                 verbose=1, save_best_only=True)

pretraining_model.fit(train_df, validation_data=val_df, epochs=25, 
                     callbacks=[checkpoints])

Epoch 1/25


I0000 00:00:1737663164.666997    8800 service.cc:148] XLA service 0x76f088001ae0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1737663164.667037    8800 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2025-01-23 14:12:44.834488: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1737663165.013986    8800 cuda_dnn.cc:529] Loaded cuDNN version 90300







[1m   2/2172[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:12[0m 89ms/step - accuracy: 0.0000e+00 - loss: 258286784.0000

I0000 00:00:1737663174.755262    8800 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1085/2172[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m1:30[0m 83ms/step - accuracy: 0.0014 - loss: nan                 








[1m1143/2172[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m1:36[0m 94ms/step - accuracy: 0.0014 - loss: nan

KeyboardInterrupt: 