In [1]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118/torch_stable.html

In [1]:
import json
import time
import torch
import os
from PIL import Image
from pycocotools.coco import COCO

device = "cuda:0" if torch.cuda.is_available() else "cpu"

categories = [
        {
            "id": 1,
            "name": "person",
            "supercategory": ""
        },
        {
            "id": 2,
            "name": "bicycle",
            "supercategory": ""
        },
        {
            "id": 3,
            "name": "car",
            "supercategory": ""
        },
        {
            "id": 4,
            "name": "motorcycle",
            "supercategory": ""
        },
        {
            "id": 6,
            "name": "bus",
            "supercategory": ""
        },
        {
            "id": 8,
            "name": "truck",
            "supercategory": ""
        }
    ]

<h2>DETR</h2>

In [2]:
from transformers import AutoImageProcessor, DetrForObjectDetection

image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

# Change model to eval mode and onto the GPU
model.eval()
model.to(device)

# Creating a function to infer images from a directory and return the predictions in a single coco eval json file
def infer_images_from_directory(model, image_processor, img_directory, output_file, size=(3840, 2160)):
    """
        Function to infer images from a directory and return the predictions in a single coco eval json file

        Args:
            model: The model to be used for inference
            image_processor: The image processor
            img_directory: The directory containing the images
            output_file: The output file where the predictions will be saved
            size: The size to which the images will be resized

        Returns:
            coco_output: The predictions in COCO eval json format
    """
    # Loading the images from the directory
    images = []
    for img in os.listdir(img_directory):
        images.append(img)

    coco_annotations = []
    coco_images = []

    for image_name in images:
        # Loading the image
        image = Image.open(os.path.join(img_directory, image_name))
        
        inputs = image_processor(images=image, return_tensors="pt").to(device)
        
        # Move the inputs to the device and define by batch size
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        outputs = model(**inputs)

        # converting outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        target_sizes = torch.tensor([image.size[::-1]]).to(device)

        results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]
        
        image_id = int(image_name.split('_')[-1].split('.')[0])

        # Appending the image info to the coco_images list
        coco_images.append({
            'file_name': image_name,
            'height': image.size[1],
            'width': image.size[0],
            'id': image_id
        })

        # Appending the predictions to the coco_annotations list
        for i in range(len(results['labels'])):
            coco_annotations.append({
                'id': len(coco_annotations) + 1,
                'image_id': image_id,
                'category_id': int(results['labels'][i]),
                'bbox': [float(results['boxes'][i][0]), float(results['boxes'][i][1]), float(results['boxes'][i][2] - results['boxes'][i][0]), float(results['boxes'][i][3] - results['boxes'][i][1])],
                'score': float(results['scores'][i]),
                'area': float(results['boxes'][i][2] * results['boxes'][i][3]),  # Assuming the area is width * height
                'segmentation': [], # Assuming the segmentation is empty
                'iscrowd': 0
            })

        if len(coco_images) == 100:  # Limiting the number of images to 100 
            break
    
    # Removing annotations which don't have a category_id in the categories list
    coco_annotations = [annotation for annotation in coco_annotations if annotation['category_id'] in [category['id'] for category in categories]]
    
    # Saving the predictions to COCO eval json file
    coco_output = {
        'categories': categories,  # Assuming 90 categories
        'images': coco_images,
        'annotations': coco_annotations,
    }

    # Creating directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
    with open(output_file, 'w') as f:
        json.dump(coco_output, f, indent=4)

    return coco_output

# Example usage
predictions_json = 'predictions/DETR/predictions_2160p.json'
images_path = 'frames/2160p'
size = (3840, 2160)

start = time.time()
infer_images_from_directory(model, image_processor, images_path, predictions_json, size)
end = time.time()

# Saving the time taken to infer the images
with open('predictions/DETR/time_taken.txt', 'w') as f:
    f.write(f'Time taken to infer images: {end - start} seconds')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  return F.conv2d(input, weight, b

<h2>Grounding DINO - Zero Shot Detection - Takes some time and not picking up cars</h2>

In [3]:
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

model_id = "IDEA-Research/grounding-dino-tiny"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)

# Change model to eval mode and onto the GPU
model.eval()
model.to(device)

# Creating a function to infer images from a directory and return the predictions in a single coco eval json file
def infer_images_from_directory(model, image_processor, img_directory, output_file, size=(3840, 2160)):
    """
        Function to infer images from a directory and return the predictions in a single coco eval json file

        Args:
            model: The model to be used for inference
            image_processor: The image processor
            img_directory: The directory containing the images
            output_file: The output file where the predictions will be saved
            size: The size to which the images will be resized

        Returns:
            coco_output: The predictions in COCO eval json format
    """
    # Loading the images from the directory
    images = []
    for img in os.listdir(img_directory):
        images.append(img)

    coco_annotations = []
    coco_images = []

    text =" a person. a bicycle. a car. a motorcycle. a bus. a truck."

    for image_name in images:
        # Loading the image
        image = Image.open(os.path.join(img_directory, image_name))
        
        inputs = image_processor(images=image, text=text, return_tensors="pt").to(device)
        
        # Move the inputs to the device and define by batch size
        # inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        outputs = model(**inputs)

        # converting outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        target_sizes = torch.tensor([image.size[::-1]]).to(device)

        results = image_processor.post_process_grounded_object_detection(outputs, inputs.input_ids, box_threshold=0.4, text_threshold=0.3, target_sizes=target_sizes)[0]
        
        image_id = int(image_name.split('_')[-1].split('.')[0])

        # Appending the image info to the coco_images list
        coco_images.append({
            'file_name': image_name,
            'height': image.size[1],
            'width': image.size[0],
            'id': image_id
        })

        # Create a mapping from string labels to integer category IDs
        label_to_category_id = {category['name']: category['id'] for category in categories}

        # Appending the predictions to the coco_annotations list
        for i in range(len(results['labels'])):
            # Retrieving initial category_id from the labels
            category_id = label_to_category_id['car']  # Default category_id
            # Search through the labels whether there is a token that is in the label
            for label in results['labels'][i]:
                if label in label_to_category_id:
                    category_id = label_to_category_id[label]
                    break
            print(results['labels'], category_id)
            coco_annotations.append({
                'id': len(coco_annotations) + 1,
                'image_id': image_id,
                'category_id': category_id,
                'bbox': [float(results['boxes'][i][0]), float(results['boxes'][i][1]), float(results['boxes'][i][2] - results['boxes'][i][0]), float(results['boxes'][i][3] - results['boxes'][i][1])],
                'score': float(results['scores'][i]),
                'area': float(results['boxes'][i][2] * results['boxes'][i][3]),  # Assuming the area is width * height
                'segmentation': [], # Assuming the segmentation is empty
                'iscrowd': 0
            })

        if len(coco_images) == 100:  # Limiting the number of images to 100 
            break
    
    # Removing annotations which don't have a category_id in the categories list
    coco_annotations = [annotation for annotation in coco_annotations if annotation['category_id'] in [category['id'] for category in categories]]
    
    # Saving the predictions to COCO eval json file
    coco_output = {
        'categories': categories,  # Assuming 90 categories
        'images': coco_images,
        'annotations': coco_annotations,
    }

    # Creating directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
    with open(output_file, 'w') as f:
        json.dump(coco_output, f, indent=4)

    return coco_output

# Example usage
predictions_json = 'predictions/GroundingDINO/predictions_2160p.json'
images_path = 'frames/2160p'
size = (3840, 2160)

start = time.time()
infer_images_from_directory(model, processor, images_path, predictions_json, size)
end = time.time()

# Saving the time taken to infer the images
with open('predictions/GroundingDINO/time_taken.txt', 'w') as f:
    f.write(f'Time taken to infer images: {end - start} seconds')


['a bicycle a motorcycle', 'a bus'] 3
['a bicycle a motorcycle', 'a bus'] 3
['a bicycle a motorcycle', 'a bus'] 3
['a bicycle a motorcycle', 'a bus'] 3
['a motorcycle', 'a bus'] 3
['a motorcycle', 'a bus'] 3
['a bus', 'a a motorcycle'] 3
['a bus', 'a a motorcycle'] 3
['a bus', 'a bicycle a motorcycle'] 3
['a bus', 'a bicycle a motorcycle'] 3
['a bus', 'a bicycle a motorcycle'] 3
['a bus', 'a bicycle a motorcycle'] 3


KeyboardInterrupt: 

<h2>RetinaNet</h2>

In [7]:
# Loading the retinanet model from pytorch hub
import torchvision
from torchvision.models.detection.retinanet import RetinaNet
from torchvision.models.detection.retinanet import RetinaNet_ResNet50_FPN_Weights

model = torchvision.models.detection.retinanet_resnet50_fpn(weights=RetinaNet_ResNet50_FPN_Weights.DEFAULT)
model.eval()

# model.to(device)

# Defining a transform to resize the image
transform = torchvision.transforms.Compose([
    # torchvision.transforms.Resize((800, 800)),
    torchvision.transforms.ToTensor()
])


# Creating a function to infer images from a directory and return the predictions in a single coco eval json file
def pytorch_infer_images_from_directory(model, transform_function, img_directory, output_file):
    """
        Function to infer images from a directory and return the predictions in a single coco eval json file

        Args:
            model: The model to be used for inference
            transform_function: The transform function to be applied to the images
            img_directory: The directory containing the images
            output_file: The output file where the predictions will be saved

        Returns:
            coco_output: The predictions in COCO eval json format
    """
    # Loading the images from the directory
    images = []
    for img in os.listdir(img_directory):
        images.append(img)

    coco_annotations = []
    coco_images = []

    for image_name in images:
        # Loading the image
        image = Image.open(os.path.join(img_directory, image_name))
        old_image = image.copy()

        # Applying the transform to the image
        image = transform_function(image)

        inputs = [image]

        # inputs = list(img.to(device) for img in image)

        # Getting the predictions
        with torch.no_grad():
            predictions = model(inputs)
            results = predictions[0]

        image_id = int(image_name.split('_')[-1].split('.')[0])

        # Removing predictions with score less than threshold
        threshold = 0.5
        results = {key: value[results['scores'] > threshold] for key, value in results.items()}

        # Appending the image info to the coco_images list
        coco_images.append({
            'file_name': image_name,
            'height': old_image.size[1],
            'width': old_image.size[0],
            'id': image_id
        })

        # Appending the predictions to the coco_annotations list
        for i in range(len(results['labels'])):
            coco_annotations.append({
                'id': len(coco_annotations) + 1,
                'image_id': image_id,
                'category_id': int(results['labels'][i]),
                'bbox': [float(results['boxes'][i][0]), float(results['boxes'][i][1]), float(results['boxes'][i][2] - results['boxes'][i][0]), float(results['boxes'][i][3] - results['boxes'][i][1])],
                'score': float(results['scores'][i]),
                'area': float((results['boxes'][i][2] - results['boxes'][i][0]) * (results['boxes'][i][3] - results['boxes'][i][1]).item()),  # Assuming the bbox format is [xmin, ymin, xmax, ymax]                'iscrowd': 0
                'segmentation': [], # Assuming the segmentation is empty
                'iscrowd': 0
            })

        if len(coco_images) == 100:  # Limiting the number of images to 100 
            break
    
    # Removing annotations which don't have a category_id in the categories list
    coco_annotations = [annotation for annotation in coco_annotations if annotation['category_id'] in [category['id'] for category in categories]]
    
    # Saving the predictions to COCO eval json file
    coco_output = {
        'categories': categories,  # Assuming 90 categories
        'images': coco_images,
        'annotations': coco_annotations,
    }

    # Creating directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
    with open(output_file, 'w') as f:
        json.dump(coco_output, f, indent=4)

    return coco_output

# Example usage
predictions_json = 'predictions/RetinaNet/predictions_2160p.json'
images_path = 'frames/2160p'

start = time.time()
pytorch_infer_images_from_directory(model, transform, images_path, predictions_json)
end = time.time()

# Saving the time taken to infer the images
with open('predictions/RetinaNet/time_taken.txt', 'w') as f:
    f.write(f'Time taken to infer images: {end - start} seconds')

<h2>Faster R-CNN</h2>

In [2]:
# Loading the retinanet model from pytorch hub
import torchvision
from torchvision.models.detection.faster_rcnn import FasterRCNN
from torchvision.models.detection.faster_rcnn import FasterRCNN_ResNet50_FPN_Weights

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT)
model.eval()

# model.to(device)

# Defining a transform to resize the image
transform = torchvision.transforms.Compose([
    # torchvision.transforms.Resize((800, 800)),
    torchvision.transforms.ToTensor()
])


# Creating a function to infer images from a directory and return the predictions in a single coco eval json file
def pytorch_infer_images_from_directory(model, transform_function, img_directory, output_file):
    """
        Function to infer images from a directory and return the predictions in a single coco eval json file

        Args:
            model: The model to be used for inference
            transform_function: The transform function to be applied to the images
            img_directory: The directory containing the images
            output_file: The output file where the predictions will be saved

        Returns:
            coco_output: The predictions in COCO eval json format
    """
    # Loading the images from the directory
    images = []
    for img in os.listdir(img_directory):
        images.append(img)

    coco_annotations = []
    coco_images = []

    for image_name in images:
        # Loading the image
        image = Image.open(os.path.join(img_directory, image_name))
        old_image = image.copy()

        # Applying the transform to the image
        image = transform_function(image)

        inputs = [image]

        # inputs = list(img.to(device) for img in image)

        # Getting the predictions
        with torch.no_grad():
            predictions = model(inputs)
            results = predictions[0]

        image_id = int(image_name.split('_')[-1].split('.')[0])

        # Removing predictions with score less than threshold
        threshold = 0.5
        results = {key: value[results['scores'] > threshold] for key, value in results.items()}

        # Appending the image info to the coco_images list
        coco_images.append({
            'file_name': image_name,
            'height': old_image.size[1],
            'width': old_image.size[0],
            'id': image_id
        })

        # Appending the predictions to the coco_annotations list
        for i in range(len(results['labels'])):
            coco_annotations.append({
                'id': len(coco_annotations) + 1,
                'image_id': image_id,
                'category_id': int(results['labels'][i]),
                'bbox': [float(results['boxes'][i][0]), float(results['boxes'][i][1]), float(results['boxes'][i][2] - results['boxes'][i][0]), float(results['boxes'][i][3] - results['boxes'][i][1])],
                'score': float(results['scores'][i]),
                'area': float((results['boxes'][i][2] - results['boxes'][i][0]) * (results['boxes'][i][3] - results['boxes'][i][1]).item()),  # Assuming the bbox format is [xmin, ymin, xmax, ymax]                'iscrowd': 0
                'segmentation': [], # Assuming the segmentation is empty
                'iscrowd': 0
            })

        if len(coco_images) == 100:  # Limiting the number of images to 100 
            break
    
    # Removing annotations which don't have a category_id in the categories list
    coco_annotations = [annotation for annotation in coco_annotations if annotation['category_id'] in [category['id'] for category in categories]]
    
    # Saving the predictions to COCO eval json file
    coco_output = {
        'categories': categories,  # Assuming 90 categories
        'images': coco_images,
        'annotations': coco_annotations,
    }

    # Creating directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
    with open(output_file, 'w') as f:
        json.dump(coco_output, f, indent=4)

    return coco_output

# Example usage
predictions_json = 'predictions/FasterRCNN/predictions_2160p.json'
images_path = 'frames/2160p'

start = time.time()
pytorch_infer_images_from_directory(model, transform, images_path, predictions_json)
end = time.time()

# Saving the time taken to infer the images
with open('predictions/FasterRCNN/time_taken.txt', 'w') as f:
    f.write(f'Time taken to infer images: {end - start} seconds')

  from .autonotebook import tqdm as notebook_tqdm


<h2>FCOS</h2>

In [4]:
# Loading the retinanet model from pytorch hub
import torchvision
from torchvision.models.detection.fcos import FCOS
from torchvision.models.detection.fcos import FCOS_ResNet50_FPN_Weights

model = torchvision.models.detection.fcos_resnet50_fpn(weights=FCOS_ResNet50_FPN_Weights.DEFAULT)
model.eval()

# model.to(device)

# Defining a transform to resize the image
transform = torchvision.transforms.Compose([
    # torchvision.transforms.Resize((800, 800)),
    torchvision.transforms.ToTensor()
])


# Creating a function to infer images from a directory and return the predictions in a single coco eval json file
def pytorch_infer_images_from_directory(model, transform_function, img_directory, output_file):
    """
        Function to infer images from a directory and return the predictions in a single coco eval json file

        Args:
            model: The model to be used for inference
            transform_function: The transform function to be applied to the images
            img_directory: The directory containing the images
            output_file: The output file where the predictions will be saved

        Returns:
            coco_output: The predictions in COCO eval json format
    """
    # Loading the images from the directory
    images = []
    for img in os.listdir(img_directory):
        images.append(img)

    coco_annotations = []
    coco_images = []

    for image_name in images:
        # Loading the image
        image = Image.open(os.path.join(img_directory, image_name))
        old_image = image.copy()

        # Applying the transform to the image
        image = transform_function(image)

        inputs = [image]

        # inputs = list(img.to(device) for img in image)

        # Getting the predictions
        with torch.no_grad():
            predictions = model(inputs)
            results = predictions[0]

        image_id = int(image_name.split('_')[-1].split('.')[0])

        # Removing predictions with score less than threshold
        threshold = 0.5
        results = {key: value[results['scores'] > threshold] for key, value in results.items()}

        # Appending the image info to the coco_images list
        coco_images.append({
            'file_name': image_name,
            'height': old_image.size[1],
            'width': old_image.size[0],
            'id': image_id
        })

        # Appending the predictions to the coco_annotations list
        for i in range(len(results['labels'])):
            coco_annotations.append({
                'id': len(coco_annotations) + 1,
                'image_id': image_id,
                'category_id': int(results['labels'][i]),
                'bbox': [float(results['boxes'][i][0]), float(results['boxes'][i][1]), float(results['boxes'][i][2] - results['boxes'][i][0]), float(results['boxes'][i][3] - results['boxes'][i][1])],
                'score': float(results['scores'][i]),
                'area': float((results['boxes'][i][2] - results['boxes'][i][0]) * (results['boxes'][i][3] - results['boxes'][i][1]).item()),  # Assuming the bbox format is [xmin, ymin, xmax, ymax]                'iscrowd': 0
                'segmentation': [], # Assuming the segmentation is empty
                'iscrowd': 0
            })

        if len(coco_images) == 100:  # Limiting the number of images to 100 
            break
    
    # Removing annotations which don't have a category_id in the categories list
    coco_annotations = [annotation for annotation in coco_annotations if annotation['category_id'] in [category['id'] for category in categories]]
    
    # Saving the predictions to COCO eval json file
    coco_output = {
        'categories': categories,  # Assuming 90 categories
        'images': coco_images,
        'annotations': coco_annotations,
    }

    # Creating directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
    with open(output_file, 'w') as f:
        json.dump(coco_output, f, indent=4)

    return coco_output

# Example usage
predictions_json = 'predictions/FCOS/predictions_2160p.json'
images_path = 'frames/2160p'

start = time.time()
pytorch_infer_images_from_directory(model, transform, images_path, predictions_json)
end = time.time()

# Saving the time taken to infer the images
with open('predictions/FCOS/time_taken.txt', 'w') as f:
    f.write(f'Time taken to infer images: {end - start} seconds')

Downloading: "https://download.pytorch.org/models/fcos_resnet50_fpn_coco-99b0c9b7.pth" to C:\Users\User/.cache\torch\hub\checkpoints\fcos_resnet50_fpn_coco-99b0c9b7.pth
100%|██████████| 124M/124M [00:19<00:00, 6.57MB/s] 


<h2>SSD</h2>

In [4]:
# Loading the retinanet model from pytorch hub
import torchvision
from torchvision.models.detection.ssd import SSD
from torchvision.models.detection.ssd import SSD300_VGG16_Weights

model = torchvision.models.detection.ssd300_vgg16(weights=SSD300_VGG16_Weights.DEFAULT)
model.eval()

# model.to(device)

# Defining a transform to resize the image
transform = torchvision.transforms.Compose([
    # torchvision.transforms.Resize((800, 800)),
    torchvision.transforms.ToTensor()
])


# Creating a function to infer images from a directory and return the predictions in a single coco eval json file
def pytorch_infer_images_from_directory(model, transform_function, img_directory, output_file):
    """
        Function to infer images from a directory and return the predictions in a single coco eval json file

        Args:
            model: The model to be used for inference
            transform_function: The transform function to be applied to the images
            img_directory: The directory containing the images
            output_file: The output file where the predictions will be saved

        Returns:
            coco_output: The predictions in COCO eval json format
    """
    # Loading the images from the directory
    images = []
    for img in os.listdir(img_directory):
        images.append(img)

    coco_annotations = []
    coco_images = []

    for image_name in images:
        # Loading the image
        image = Image.open(os.path.join(img_directory, image_name))
        old_image = image.copy()

        # Applying the transform to the image
        image = transform_function(image)

        inputs = [image]

        # inputs = list(img.to(device) for img in image)

        # Getting the predictions
        with torch.no_grad():
            predictions = model(inputs)
            results = predictions[0]

        image_id = int(image_name.split('_')[-1].split('.')[0])

        # Removing predictions with score less than threshold
        threshold = 0.3
        results = {key: value[results['scores'] > threshold] for key, value in results.items()}

        # Appending the image info to the coco_images list
        coco_images.append({
            'file_name': image_name,
            'height': old_image.size[1],
            'width': old_image.size[0],
            'id': image_id
        })

        # Appending the predictions to the coco_annotations list
        for i in range(len(results['labels'])):
            coco_annotations.append({
                'id': len(coco_annotations) + 1,
                'image_id': image_id,
                'category_id': int(results['labels'][i]),
                'bbox': [float(results['boxes'][i][0]), float(results['boxes'][i][1]), float(results['boxes'][i][2] - results['boxes'][i][0]), float(results['boxes'][i][3] - results['boxes'][i][1])],
                'score': float(results['scores'][i]),
                'area': float((results['boxes'][i][2] - results['boxes'][i][0]) * (results['boxes'][i][3] - results['boxes'][i][1]).item()),  # Assuming the bbox format is [xmin, ymin, xmax, ymax]                'iscrowd': 0
                'segmentation': [], # Assuming the segmentation is empty
                'iscrowd': 0
            })

        if len(coco_images) == 100:  # Limiting the number of images to 100 
            break
    
    # Removing annotations which don't have a category_id in the categories list
    coco_annotations = [annotation for annotation in coco_annotations if annotation['category_id'] in [category['id'] for category in categories]]
    
    # Saving the predictions to COCO eval json file
    coco_output = {
        'categories': categories,  # Assuming 90 categories
        'images': coco_images,
        'annotations': coco_annotations,
    }

    # Creating directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
    with open(output_file, 'w') as f:
        json.dump(coco_output, f, indent=4)

    return coco_output

# Example usage
predictions_json = 'predictions/SSD/predictions_2160p.json'
images_path = 'frames/2160p'

start = time.time()
pytorch_infer_images_from_directory(model, transform, images_path, predictions_json)
end = time.time()

# Saving the time taken to infer the images
with open('predictions/SSD/time_taken.txt', 'w') as f:
    f.write(f'Time taken to infer images: {end - start} seconds')

<h2>SSDLite</h2>

In [5]:
# Loading the retinanet model from pytorch hub
import torchvision
# from torchvision.models.detection.ssdlite import SSDLite
from torchvision.models.detection.ssdlite import SSDLite320_MobileNet_V3_Large_Weights

model = torchvision.models.detection.ssdlite320_mobilenet_v3_large(weights=SSDLite320_MobileNet_V3_Large_Weights.DEFAULT)
model.eval()

# model.to(device)

# Defining a transform to resize the image
transform = torchvision.transforms.Compose([
    # torchvision.transforms.Resize((800, 800)),
    torchvision.transforms.ToTensor()
])


# Creating a function to infer images from a directory and return the predictions in a single coco eval json file
def pytorch_infer_images_from_directory(model, transform_function, img_directory, output_file):
    """
        Function to infer images from a directory and return the predictions in a single coco eval json file

        Args:
            model: The model to be used for inference
            transform_function: The transform function to be applied to the images
            img_directory: The directory containing the images
            output_file: The output file where the predictions will be saved

        Returns:
            coco_output: The predictions in COCO eval json format
    """
    # Loading the images from the directory
    images = []
    for img in os.listdir(img_directory):
        images.append(img)

    coco_annotations = []
    coco_images = []

    for image_name in images:
        # Loading the image
        image = Image.open(os.path.join(img_directory, image_name))
        old_image = image.copy()

        # Applying the transform to the image
        image = transform_function(image)

        inputs = [image]

        # inputs = list(img.to(device) for img in image)

        # Getting the predictions
        with torch.no_grad():
            predictions = model(inputs)
            results = predictions[0]

        image_id = int(image_name.split('_')[-1].split('.')[0])

        # Removing predictions with score less than threshold
        threshold = 0.3
        results = {key: value[results['scores'] > threshold] for key, value in results.items()}

        # Appending the image info to the coco_images list
        coco_images.append({
            'file_name': image_name,
            'height': old_image.size[1],
            'width': old_image.size[0],
            'id': image_id
        })

        # Appending the predictions to the coco_annotations list
        for i in range(len(results['labels'])):
            coco_annotations.append({
                'id': len(coco_annotations) + 1,
                'image_id': image_id,
                'category_id': int(results['labels'][i]),
                'bbox': [float(results['boxes'][i][0]), float(results['boxes'][i][1]), float(results['boxes'][i][2] - results['boxes'][i][0]), float(results['boxes'][i][3] - results['boxes'][i][1])],
                'score': float(results['scores'][i]),
                'area': float((results['boxes'][i][2] - results['boxes'][i][0]) * (results['boxes'][i][3] - results['boxes'][i][1]).item()),  # Assuming the bbox format is [xmin, ymin, xmax, ymax]                'iscrowd': 0
                'segmentation': [], # Assuming the segmentation is empty
                'iscrowd': 0
            })

        if len(coco_images) == 100:  # Limiting the number of images to 100 
            break
    
    # Removing annotations which don't have a category_id in the categories list
    coco_annotations = [annotation for annotation in coco_annotations if annotation['category_id'] in [category['id'] for category in categories]]
    
    # Saving the predictions to COCO eval json file
    coco_output = {
        'categories': categories,  # Assuming 90 categories
        'images': coco_images,
        'annotations': coco_annotations,
    }

    # Creating directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
    with open(output_file, 'w') as f:
        json.dump(coco_output, f, indent=4)

    return coco_output

# Example usage
predictions_json = 'predictions/SSDLite/predictions_2160p.json'
images_path = 'frames/2160p'

start = time.time()
pytorch_infer_images_from_directory(model, transform, images_path, predictions_json)
end = time.time()

# Saving the time taken to infer the images
with open('predictions/SSDLite/time_taken.txt', 'w') as f:
    f.write(f'Time taken to infer images: {end - start} seconds')