# Object detection with Faster R-CNN

## Import Libraries

In [4]:
import torchvision
from torchvision import  transforms 
import torch
from torch import no_grad
import requests
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

## Auxiliary Functions

This function will assign a string name to a predicted class and eliminate predictions whose likelihood is under a threshold.

In [7]:
def get_predictions(pred,threshold=0.8,objects=None ):
    """
    Assign a string name to each predicted class and drop predictions whose
    confidence score is not above a threshold.

    Parameters:
    - pred: A list whose first element is a dictionary of predictions with:
        - 'labels': Tensor of predicted class indices.
        - 'scores': Tensor of confidence scores for each prediction.
        - 'boxes': Tensor of bounding box coordinates for each prediction
          (four values per box; presumably (x1, y1, x2, y2) as produced by
          torchvision detection models -- confirm against the model used).
    - threshold: Confidence threshold. Only predictions with a score strictly
      greater than this value are kept.
    - objects: Optional collection of class names to keep. If provided (and
      non-empty), only predictions of these classes are returned.

    Returns:
    - predicted_classes: A list of tuples, each containing:
        - Class name (string) looked up in COCO_INSTANCE_CATEGORY_NAMES
        - Confidence score (NumPy scalar)
        - Bounding box as a list of two points:
          [(box[0], box[1]), (box[2], box[3])]
    """
    detections = pred[0]

    # Detach from the autograd graph and move to host memory before converting,
    # so the conversion also works for tensors that track gradients or live on
    # an accelerator.
    labels = detections['labels'].detach().cpu().numpy()
    scores = detections['scores'].detach().cpu().numpy()
    boxes = detections['boxes'].detach().cpu().numpy()

    predicted_classes = []
    for class_index, score, box in zip(labels, scores, boxes):
        if score > threshold:
            name = COCO_INSTANCE_CATEGORY_NAMES[class_index]
            # An empty/None `objects` means "keep every class".
            if not objects or name in objects:
                predicted_classes.append(
                    (name, score, [(box[0], box[1]), (box[2], box[3])])
                )
    return predicted_classes

## Draw a box around each detected object

In [9]:
def draw_box(predicted_classes,image,rect_th= 10,text_size= 3,text_th=3):
    """
    Draw a labelled rectangle around each predicted object and show the result.

    Parameters:
    - predicted_classes: Iterable of (label, probability, box) entries, where
      box is a pair of points [(x1, y1), (x2, y2)].
    - image: Object whose .numpy() returns an array that is transposed from
      axis order (0, 1, 2) to (1, 2, 0); presumably a (C, H, W) RGB tensor
      with values in [0, 1] -- values outside that range are clipped.
    - rect_th: Line thickness of the rectangle.
    - text_size: Font scale of the caption.
    - text_th: Line thickness of the caption text.

    The annotated image is displayed with plt.imshow; nothing is returned.
    """
    # Build an OpenCV-style uint8 BGR image:
    # 1. Reorder the axes with transpose((1, 2, 0)) and clip to [0, 1].
    # 2. Convert RGB -> BGR.
    # 3. Clip again, scale to [0, 255] and convert to uint8. The copy gives
    #    OpenCV a fresh contiguous buffer to draw on.
    rgb = np.clip(image.numpy().transpose((1, 2, 0)), 0, 1)
    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    img = (np.clip(bgr, 0, 1) * 255).astype(np.uint8).copy()

    green = (0, 255, 0)
    for predicted_class in predicted_classes:
        label, probability, box = predicted_class[:3]

        # OpenCV drawing functions expect integer pixel coordinates; the boxes
        # usually arrive as NumPy floats, so convert them explicitly.
        top_left = (int(round(float(box[0][0]))), int(round(float(box[0][1]))))
        bottom_right = (int(round(float(box[1][0]))), int(round(float(box[1][1]))))

        cv2.rectangle(img, top_left, bottom_right, green, rect_th)
        # One caption with label and score (drawing the bare label first at the
        # same origin would be completely overdrawn by this text).
        caption = label+": "+str(round(probability,2))
        cv2.putText(img, caption, top_left, cv2.FONT_HERSHEY_SIMPLEX, text_size, green, thickness=text_th)

    # matplotlib expects RGB, so convert back before displaying.
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

#### Free up some memory

In [11]:
def save_RAM(image_=False):
    """
    Release the notebook's large global objects to free memory.

    Empties the CUDA caching allocator, then removes the module-level names
    ``img`` and ``pred``. When ``image_`` is true, the module-level ``image``
    is closed and removed as well.

    Names that are not currently defined are skipped, so the function can be
    called repeatedly (or before a prediction has been made) without raising
    NameError.

    Parameters:
    - image_: If true, also close and delete the global ``image``.
    """
    module_globals = globals()

    torch.cuda.empty_cache()
    module_globals.pop("img", None)
    module_globals.pop("pred", None)

    if image_:
        current_image = module_globals.get("image")
        if current_image is not None:
            # Close first, then drop the name, mirroring close()-then-del.
            current_image.close()
        module_globals.pop("image", None)

## Load Pre-trained Faster R-CNN

<a href='https://arxiv.org/abs/1506.01497'>Faster R-CNN</a> is a model that predicts both bounding boxes and class scores for potential objects in an image. The version loaded here is pre-trained on <a href="https://cocodataset.org/">COCO</a>.


In [14]:
# Build Faster R-CNN (ResNet-50 backbone + FPN) from torchvision, requesting
# the pre-trained weights.
model_ = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
# Switch to evaluation mode so that layers which behave differently in
# training (e.g. dropout, batch normalization) act deterministically.
model_.eval()
# The notebook only runs inference, so turn off gradient tracking on every
# parameter.
for name, param in model_.named_parameters():
    param.requires_grad_(False)
print("done")



done


In [15]:
def model(x):
    """
    Run the global detector ``model_`` on ``x`` with gradients disabled.

    Parameters:
    - x: Input forwarded unchanged to ``model_``.

    Returns:
    - Whatever ``model_(x)`` returns.
    """
    # Inference only: without gradient tracking no intermediate values are
    # kept for backpropagation, which saves memory and time.
    with torch.no_grad():
        return model_(x)

## the classes used

In [17]:
# Lookup table from predicted class index to a human-readable class name.
# get_predictions indexes this list with the values of pred[0]['labels'], so
# the position of every entry matters: do not reorder or remove items.
# Index 0 is the background class. The 'N/A' entries are placeholders that
# keep later names at their index (presumably ids unused by the pre-trained
# model -- TODO confirm).
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]
# Displayed as the cell output: the number of entries in the table.
len(COCO_INSTANCE_CATEGORY_NAMES)

91

## Object localization

In [None]:
img_path = "test_images/desk.jpeg"
half = 0.5

# Load the test image and downscale it by `half` along both dimensions.
# Image.resize returns a NEW image instead of modifying the original, so the
# result has to be assigned; otherwise the downscaling is silently discarded.
# The `with` block closes the underlying file once the pixels have been read.
with Image.open(img_path) as original:
    image = original.resize(
        tuple(int(half * s) for s in original.size)
    )
plt.imshow(image)
plt.show()