In [17]:
import tensorflow as tf
import tensorflow_datasets as tfds
import sys
import torch 
from torchvision import datasets, transforms


sys.setrecursionlimit(3000)  # Set recursion limit to a higher value


from PIL import Image

# Custom transformation to resize images and ensure they are of the same size
class ResizeAndPad:
    def __init__(self, target_size):
        self.target_size = target_size

    def __call__(self, image):
        width, height = image.size
        if width < self.target_size[0] or height < self.target_size[1]:
            # Resize the image to fit within the target size while maintaining aspect ratio
            ratio = min(self.target_size[0] / width, self.target_size[1] / height)
            new_width = int(width * ratio)
            new_height = int(height * ratio)
            image = image.resize((new_width, new_height), Image.ANTIALIAS)

        # Create a new blank image of the target size
        new_image = Image.new("RGB", self.target_size)
        # Paste the resized image onto the blank image, centered
        position = ((self.target_size[0] - new_width) // 2, (self.target_size[1] - new_height) // 2)
        new_image.paste(image, position)
        
        return new_image

# Define transform with resizing to ensure consistent dimensions
transform = transforms.Compose([
    ResizeAndPad((375, 500)),  # Resize and pad images to a fixed size
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])


# Download and load the PASCAL VOC dataset
train_dataset = datasets.VOCDetection(root='./data', year='2012', image_set='train', download=True, transform=transform)
test_dataset = datasets.VOCDetection(root='./data', year='2012', image_set='val', download=True, transform=transform)

# Create data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)










Using downloaded and verified file: ./data/VOCtrainval_11-May-2012.tar
Extracting ./data/VOCtrainval_11-May-2012.tar to ./data
Using downloaded and verified file: ./data/VOCtrainval_11-May-2012.tar
Extracting ./data/VOCtrainval_11-May-2012.tar to ./data


In [18]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy, MeanSquaredError

# Load pre-trained ResNet50 model (excluding top layers)
base_model = ResNet50(weights='/Users/parthmalani/Downloads/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5', include_top=False, input_shape=(224, 224, 3))

# Freeze the convolutional layers to prevent them from being trained
base_model.trainable = False

# Modify the final layers for object detection
flatten = Flatten()(base_model.output)
dense1 = Dense(1024, activation='relu')(flatten)
dropout = Dropout(0.5)(dense1)
dense2 = Dense(256, activation='relu')(dropout)
output_boxes = Dense(4, name='bounding_box')(dense2)  # Output bounding box coordinates (x, y, w, h)
output_classes = Dense(80, activation='softmax', name='class_label')(dense2)  # Output class labels (80 COCO classes)

# Create the model
model = Model(inputs=base_model.input, outputs=[output_boxes, output_classes])

In [19]:
import numpy as np

# Generate random training data
num_train_samples = 1000
train_images = np.random.rand(num_train_samples, 224, 224, 3)  # Random images (1000 samples, 224x224 pixels, 3 channels)
train_boxes = np.random.rand(num_train_samples, 4)  # Random bounding boxes (1000 samples, 4 coordinates: x, y, width, height)
train_labels = np.random.randint(0, 80, size=num_train_samples)  # Random class labels (1000 samples, 80 classes)

# Generate random validation data
num_val_samples = 200
val_images = np.random.rand(num_val_samples, 224, 224, 3)  # Random images (200 samples, 224x224 pixels, 3 channels)
val_boxes = np.random.rand(num_val_samples, 4)  # Random bounding boxes (200 samples, 4 coordinates: x, y, width, height)
val_labels = np.random.randint(0, 80, size=num_val_samples)  # Random class labels (200 samples, 80 classes)

# Create TensorFlow Dataset objects for training and validation
train_dataset = tf.data.Dataset.from_tensor_slices((train_images, {'bounding_box': train_boxes, 'class_label': train_labels}))
val_dataset = tf.data.Dataset.from_tensor_slices((val_images, {'bounding_box': val_boxes, 'class_label': val_labels}))

# Apply batching, shuffling, and prefetching
batch_size = 32
train_dataset = train_dataset.shuffle(buffer_size=1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [20]:




# Compile the model
model.compile(optimizer=Adam(), 
              loss={'bounding_box': MeanSquaredError(), 'class_label': SparseCategoricalCrossentropy()},
              metrics={'bounding_box': 'accuracy', 'class_label': 'accuracy'})

# Define training parameters
epochs = 10

# Train the model
history = model.fit(train_dataset, 
                    epochs=epochs, 
                    batch_size=batch_size, 
                    validation_data=val_dataset)

# Optionally, validate the model on a separate validation set
val_metrics = model.evaluate(val_dataset)

# Extract the validation loss and accuracy from the returned metrics
val_loss, val_acc = val_metrics[0], val_metrics[1]


# Once training is complete, you can save the model if needed
model.save('object_detection_model.h5')

Epoch 1/10


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 5s/step - bounding_box_accuracy: 0.2426 - class_label_accuracy: 0.0133 - loss: 150.8083 - val_bounding_box_accuracy: 0.2650 - val_class_label_accuracy: 0.0100 - val_loss: 4.6822
Epoch 2/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 5s/step - bounding_box_accuracy: 0.2657 - class_label_accuracy: 0.0205 - loss: 4.9148 - val_bounding_box_accuracy: 0.2300 - val_class_label_accuracy: 0.0100 - val_loss: 4.5194
Epoch 3/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 2s/step - bounding_box_accuracy: 0.2664 - class_label_accuracy: 0.0115 - loss: 4.5703 - val_bounding_box_accuracy: 0.2750 - val_class_label_accuracy: 0.0150 - val_loss: 4.5138
Epoch 4/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 3s/step - bounding_box_accuracy: 0.2292 - class_label_accuracy: 0.0096 - loss: 4.5310 - val_bounding_box_accuracy: 0.2450 - val_class_label_accuracy: 0.0150 - val_loss: 4.5014

