In [1]:
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn, FasterRCNN_ResNet50_FPN_Weights
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import os
from pycocotools.coco import COCO
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset, DataLoader
import io
import base64
import numpy as np
import torch.backends.cudnn as cudnn
import json

# Nesse arquivo é feita a detecção de todas as imagens de treinamento.

# 1 - Load trained model

In [2]:
# Primeiro o modelo é inicializado
weights = FasterRCNN_ResNet50_FPN_Weights.COCO_V1
model = fasterrcnn_resnet50_fpn(weights=weights)

# Passa o modelo p/ GPU se disponível
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model is on {device}")

# Carrega os pesos
checkpoint = torch.load(r"best_model_faster_rcnn_25_epocas_treinadas_best_at_22.pth")
model.load_state_dict(checkpoint['model_state_dict'])

# Set the model to evaluation mode
model.eval()

Model is on cuda


  checkpoint = torch.load(r"best_model_faster_rcnn_25_epocas_treinadas_best_at_22.pth")


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

# 2 - Class
Usada para passar cada img pelo detector.

In [3]:
class PersonDetectionDataset(Dataset):
    def __init__(self, img_ids, annotations_file, image_dir, transform=None):
        """
        Args:
            img_ids (list): List of filtered image IDs (img_filtradas).
            annotations_file (str): Path to the COCO annotations JSON file.
            image_dir (str): Path to the directory containing images.
            transform (callable, optional): Optional transform to apply to each image.
        """
        self.img_ids = img_ids
        self.coco = COCO(annotations_file)
        self.image_dir = image_dir
        self.transform = transform
        
        # Preload image metadata for each image ID
        self.image_metadata = {
            image_id: self.coco.loadImgs(image_id)[0] for image_id in self.img_ids
        }

    def __len__(self):
        return len(self.img_ids)
    
    def __getitem__(self, idx):
        # Get the image ID and associated metadata
        image_id = self.img_ids[idx]
        img_info = self.image_metadata[image_id]
        image_path = os.path.join(self.image_dir, img_info['file_name'])
        
        # Load image
        image = Image.open(image_path).convert("RGB")
        
        # Get annotations for the image ID
        ann_ids = self.coco.getAnnIds(imgIds=image_id, catIds=[self.coco.getCatIds(catNms=['person'])[0]], iscrowd=False)
        anns = self.coco.loadAnns(ann_ids)
        
        # Extract bounding boxes and labels
        boxes = []
        labels = []
        for ann in anns:
            x, y, width, height = ann['bbox']
            boxes.append([x, y, x + width, y + height])  # Convert to [x_min, y_min, x_max, y_max]
            labels.append(1)  # Label "1" for "person"
        
        # Convert lists to tensors
        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)
        
        # Create target dictionary with image metadata (e.g., width, height, and filename)
        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([image_id]),
            "width": img_info["width"],
            "height": img_info["height"],
            "file_name": img_info["file_name"]  # Adding the file name here
        }
        
        # Apply transforms if any
        if self.transform:
            image = self.transform(image)
        
        return image, target

# 3 - Funções de filtragem
Mesmas dos arquivos do estimador, só uso o detector nas imagens filtradas.

In [4]:
def filtrar_imagens_com_somente_uma_pessoa(coco):
    """
    Filtra imagens que contêm somente uma pessoa usando a API COCO.

    Parameters:
    coco (COCO): Objeto COCO com as anotações carregadas.

    Returns:
    list: Lista de IDs das imagens que contêm somente uma pessoa.
    """
    image_ids = coco.getImgIds()
    image_with_one_person_only = []

    for image_id in image_ids:
        ann_ids = coco.getAnnIds(imgIds=image_id, catIds=[1], iscrowd=False)
        annotations = coco.loadAnns(ann_ids)
        if len(annotations) == 1:  # Somente uma pessoa na imagem
            image_with_one_person_only.append(image_id)

    print(f"Total de imagens que possuem uma única pessoa: {len(image_with_one_person_only)}")
    return image_with_one_person_only


def filtrar_imgs_com_zeros(coco, imgs_1_person):
    """
    Filtra imagens que possuem todos os keypoints iguais a zero.

    Parameters:
    coco (COCO): Objeto COCO com as anotações carregadas.
    imgs_1_person (list): Lista de IDs das imagens que contêm somente uma pessoa.

    Returns:
    list: Lista de IDs das imagens que contêm keypoints válidos.
    """
    all_zeros_ids = []

    for image_id in imgs_1_person:
        ann_ids = coco.getAnnIds(imgIds=image_id, catIds=[1], iscrowd=False)
        annotations = coco.loadAnns(ann_ids)
        keypoints = annotations[0]['keypoints']
        all_zeros = all(k == 0 for k in keypoints)
        if all_zeros:
            all_zeros_ids.append(image_id)

    filtered_ids = set(imgs_1_person) - set(all_zeros_ids)
    filtered_ids = list(filtered_ids)

    print(f'N. de imgs após remoção das vazias: {len(filtered_ids)}')

    return filtered_ids

def filtrar_imgs_k_pontos(imgs, k):
    """
    Filtra imagens com no mínimo k pontos vísiveis.
    """
    img_filtradas = []

    for img_id in imgs:        
        # Get annotations
        ann_ids = coco.getAnnIds(imgIds=img_id, catIds=[1], iscrowd=False)
        annotations = coco.loadAnns(ann_ids)
        
        # Get keypoints
        keypoints_raw = annotations[0]['keypoints']
        keypoints = []
        for i in range(0, len(keypoints_raw), 3):
            keypoints.append([keypoints_raw[i], keypoints_raw[i+1], keypoints_raw[i+2]])
        
        # Count number of visible points
        n_visible_keypoints = 0
        for point_set in keypoints:
            if point_set[2] == 2:
                n_visible_keypoints += 1
        
        if n_visible_keypoints >= k:
            img_filtradas.append(img_id)
        
    return img_filtradas

def get_paths_from_ids(img_ids, coco, path_imgs):
    img_paths = []
    for img_id in img_ids:
        img_info = coco.loadImgs(img_id)[0]
        img_path = os.path.join(path_imgs, img_info['file_name'])
        img_paths.append(img_path)

    return img_paths

# Treino local completo
# ann_file = r"C:\Users\Marlon\Downloads\COCO_Dataset\annotations\person_keypoints_train2017.json"
# coco = COCO(ann_file)
# path_imgs = r"C:\Users\Marlon\Downloads\COCO_Dataset\train2017"

# Kaggle
# ann_file = '/kaggle/input/coco-2017-dataset/coco2017/annotations/person_keypoints_train2017.json'
# coco = COCO(ann_file)
# path_imgs = '/kaggle/input/coco-2017-dataset/coco2017/train2017'

# Teste local menor
ann_file = r"C:\Users\Marlon\Downloads\COCO_Dataset\annotations\person_keypoints_val2017.json"
coco = COCO(ann_file)
path_imgs = r"C:\Users\Marlon\Downloads\COCO_Dataset\val2017"

filter_images_1p = filtrar_imagens_com_somente_uma_pessoa(coco)
filter_no_zero_imgs = filtrar_imgs_com_zeros(coco, filter_images_1p)
img_filtradas = filter_no_zero_imgs

# Número mínimo de pontos visíveis
k = 7
img_filtradas = filtrar_imgs_k_pontos(img_filtradas, k)
print(f"Número final de imagens com n mínimo {k} pontos: {len(img_filtradas)}")

path_img_filtradas = get_paths_from_ids(img_filtradas, coco, path_imgs)

loading annotations into memory...
Done (t=0.29s)
creating index...
index created!
Total de imagens que possuem uma única pessoa: 1045
N. de imgs após remoção das vazias: 921
Número final de imagens com n mínimo 7 pontos: 721


# 4 - Load data

In [5]:
print(img_filtradas)

[393226, 532493, 45070, 401446, 213033, 471087, 475191, 561223, 356427, 32861, 192607, 442463, 311394, 561256, 221291, 49259, 94326, 417911, 118921, 270474, 356505, 266409, 90284, 188592, 229553, 356531, 213171, 549055, 110784, 389316, 65736, 344268, 16598, 516318, 295138, 33005, 545007, 458992, 135410, 426241, 213255, 250127, 426268, 4395, 205105, 61747, 463174, 336209, 414034, 483667, 8532, 270677, 393569, 434548, 442746, 426376, 336265, 369037, 459153, 102805, 94614, 110999, 389532, 254368, 106912, 102820, 381360, 135604, 262587, 278973, 238013, 192964, 340451, 451043, 205289, 258541, 455157, 352760, 29187, 61960, 25096, 451084, 270883, 365095, 127530, 389684, 295478, 152120, 270908, 324158, 373315, 172617, 451155, 107094, 123480, 33368, 438876, 565853, 504415, 172649, 406129, 275058, 569972, 565877, 311928, 520832, 193162, 25228, 328337, 184978, 152214, 4765, 369310, 246436, 127660, 414385, 389812, 402118, 565962, 471756, 520910, 99024, 479953, 74457, 488166, 262895, 250619, 492284

In [10]:
transform = ToTensor()
# Create the dataset
dataset = PersonDetectionDataset(img_ids=img_filtradas, annotations_file=ann_file, image_dir=path_imgs, transform=transform)

# Create the dataloader
# data_loader = DataLoader(dataset, batch_size=4, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))
data_loader = DataLoader(
    dataset,
    batch_size=2,  # Increase if GPU memory allows
    shuffle=False,
    pin_memory=True,  # Disable pinned memory for now  
    collate_fn=lambda x: tuple(zip(*x))  # Necessary for variable-sized detection data
)

loading annotations into memory...
Done (t=0.28s)
creating index...
index created!


# 5 - Função auxiliar: Merge bbox

Algumas detecções são duplicadas, uma bounding box está dentro da outra. E na grande maioria das vezes existe uma boa sobreposição entre as bbox, por isso estou só as fundindo. Avaliando o resultadoo eles pareceram satisfatórios.

In [11]:
def custom_merge_boxes(detections, score_threshold=0.5, containment_threshold=0.7):
    """
    Custom function to merge bounding boxes if one is mostly contained within another.
    """
    device = detections['boxes'].device  # Ensure device compatibility

    # Filter for "person" boxes above the score threshold
    boxes = []
    scores = []
    for i in range(len(detections['labels'])):
        if detections['labels'][i] == 1 and detections['scores'][i] > score_threshold:
            boxes.append(detections['boxes'][i])
            scores.append(detections['scores'][i])

    if len(boxes) == 0:
        return {"boxes": [], "scores": []}  # No boxes to keep

    boxes = torch.stack(boxes).to(device)
    scores = torch.tensor(scores, device=device)

    # Initialize merged boxes and scores
    merged_boxes = []
    used = set()

    for i in range(len(boxes)):
        if i in used:
            continue

        x_min, y_min, x_max, y_max = boxes[i]
        score = scores[i]
        
        for j in range(i + 1, len(boxes)):
            if j in used:
                continue
            
            # Calculate IoU to check overlap
            iou = calculate_iou(boxes[i], boxes[j])

            # If IoU is high (boxes significantly overlap), merge them
            if iou > containment_threshold or is_contained(boxes[i], boxes[j], containment_threshold):
                # Expand the outer box to include both
                x_min = min(x_min, boxes[j][0])
                y_min = min(y_min, boxes[j][1])
                x_max = max(x_max, boxes[j][2])
                y_max = max(y_max, boxes[j][3])
                used.add(j)  # Mark box j as used

        # Append the merged (or standalone) box
        merged_boxes.append([x_min, y_min, x_max, y_max])

    # Convert merged_boxes list to tensor
    merged_boxes = torch.tensor(merged_boxes, dtype=torch.float32)

    # Return merged boxes and corresponding scores
    return {"boxes": merged_boxes, "scores": torch.ones(len(merged_boxes), device=device)}


def calculate_iou(box1, box2):
    """ Calculate Intersection-over-Union (IoU) between two bounding boxes. """
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])

    intersection_area = max(0, x2 - x1) * max(0, y2 - y1)

    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])

    union_area = box1_area + box2_area - intersection_area

    return intersection_area / union_area if union_area != 0 else 0


def is_contained(box1, box2, threshold=0.9):
    """ Checks if box2 is mostly contained within box1. """
    # Calculate the area of each box
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])

    intersection_area = max(0, x2 - x1) * max(0, y2 - y1)
    if box1_area > 0 and (intersection_area / box1_area) > threshold:
        return True
    return False


# Plot detections function

In [12]:
def plot_detections(image, targets, detections, threshold=0.5):
    fig, ax = plt.subplots(1, figsize=(12, 8))
    ax.imshow(image)
    
    # Plot ground truth bounding boxes in green
    for box in targets['boxes']:
        x_min, y_min, x_max, y_max = box.cpu()  # Move to CPU if necessary
        width, height = x_max - x_min, y_max - y_min
        rect = patches.Rectangle((x_min, y_min), width, height, linewidth=2, edgecolor='green', facecolor='none')
        ax.add_patch(rect)

    # Plot detected bounding boxes in red with confidence scores
    for i, box in enumerate(detections['boxes']):
        score = detections['scores'][i].cpu().item()  # Move to CPU and get the scalar value
        
        # Only plot if the score is above the threshold
        if score >= threshold:
            x_min, y_min, x_max, y_max = box.cpu()  # Move box coordinates to CPU
            width, height = x_max - x_min, y_max - y_min
            rect = patches.Rectangle((x_min, y_min), width, height, linewidth=2, edgecolor='red', facecolor='none')
            ax.add_patch(rect)
            ax.text(x_min, y_min - 5, f"{score:.2f}", color="red", fontsize=12, fontweight="bold")
    
    plt.axis("off")
    plt.show()

# 6 - Detecções geradas pelo modelo

In [13]:
# # Run the model on the test images
with torch.no_grad():
    valid_images_tuned = []  # To keep track of images with a single person detection
    total_images = len(data_loader.dataset)  # Get total number of images for progress tracking
    processed_images = 0  # Counter to track progress
    
    for batch_idx, (images, targets) in enumerate(data_loader):
        print(f"Processing batch {batch_idx + 1}/{len(data_loader)}...")
        
        # Load images to GPU and track progress
        images = [image.to(device) for image in images]
        processed_images += len(images)  # Update progress counter

        # Run the model and time it
        print(f"Running model on batch {batch_idx + 1}...")
        outputs = model(images)
        print(f"Model run complete for batch {batch_idx + 1}")

        for i, output in enumerate(outputs):
            # Use custom merging for person detections
            merged_output = custom_merge_boxes(output, score_threshold=0.25, containment_threshold=0.25)
            
            # Check if only one bounding box remains
            if len(merged_output['boxes']) == 1:
                image_id = targets[i]['image_id'].item()  # Assumes `image_id` is in `targets`
                
                # Track valid detections
                valid_images_tuned.append((targets[i], merged_output, image_id))
                # Experimentar remover esse tensor da lista, suspeito que ele esteja ocupando muita memória
                # valid_images_tuned.append((images[i], targets[i], merged_output, image_id))
        
        # Print progress after each batch
        print(f"Processed {processed_images}/{total_images} images")

        # Optional: Stop after a few batches for testing
        # break


Processing batch 1/361...
Running model on batch 1...
Model run complete for batch 1
Processed 2/721 images
Processing batch 2/361...
Running model on batch 2...
Model run complete for batch 2
Processed 4/721 images
Processing batch 3/361...
Running model on batch 3...
Model run complete for batch 3
Processed 6/721 images
Processing batch 4/361...
Running model on batch 4...
Model run complete for batch 4
Processed 8/721 images
Processing batch 5/361...
Running model on batch 5...


KeyboardInterrupt: 

# 7 - Salvar detecções originais
Salvei em um arquivo .json.

**Importante não salvar o tensor image image o arquivo fica extremamente grande.**

In [None]:
import json

# Convert tensors to lists for JSON compatibility
valid_images_detection_results = [
    {
        "target": {k: (v.cpu().numpy().tolist() if isinstance(v, torch.Tensor) else v) for k, v in target.items()},
        "merged_output": {
            "boxes": merged_output["boxes"].cpu().numpy().tolist(),
            "scores": merged_output["scores"].cpu().numpy().tolist() if "scores" in merged_output else []
        },
        "image_id": image_id
    }
    for target, merged_output, image_id in valid_images_tuned
]

# Save as JSON
with open("deteccoes_originais.json", "w") as f:
    json.dump(valid_images_detection_results, f)

# 8 Adicionar padding nessas detecções

Carregando detecções originais.

In [None]:
# Para os dados de treino
with open("deteccoes_originais.json", "r") as f:
    valid_images_detections = json.load(f)

Disso, acho que só preciso extrair:
- Bounding box gerada
- Width
- Heigth
- Image ID

## 8.1 - Função para extrair isso de cada uma entrada válida

In [14]:
def extrair_info(entrada_valida):
    bbox_gerada = entrada_valida['merged_output']['boxes'][0]
    bbox_gerada = bbox_gerada.cpu().numpy().tolist() if isinstance(bbox_gerada, torch.Tensor) else bbox_gerada # Checar se é um tensor
    width, height = entrada_valida['target']['width'], entrada_valida['target']['height']
    image_id = entrada_valida['image_id']
    
    return bbox_gerada, width, height, image_id

Verificando.

In [15]:
bbox, width, height, image_id = extrair_info(valid_images_detections[0])
print(bbox, width, height, image_id)


[94.26876068115234, 191.50048828125, 157.19580078125, 337.08837890625] 640 480 393226


**Recaptulando**

- Agora tenho as detecções válidas em um arquivo .json.
- Vou mostrar o número de detecções válidas/inválidas.
- Depois adicionar o padding a essas detecções

In [16]:
print(f"Detecções válidas/totais: {len(valid_images_detections)}/{len(data_loader.dataset)}")

Detecções válidas/totais: 507/721


## Função Add padding

In [17]:
def add_percentage_padding_to_box(box, padding_percentage, image_width, image_height):
    """
    Adds padding to a bounding box as a percentage of its dimensions, keeping it within image bounds.
    
    Args:
        box: Bounding box coordinates as [x_min, y_min, x_max, y_max].
        padding_percentage: Padding as a percentage of the box dimensions (e.g., 15 for 15%).
        image_width: Width of the image.
        image_height: Height of the image.
    
    Returns:
        Padded bounding box coordinates.
    """
    x_min, y_min, x_max, y_max = box
    
    # Calculate the width and height of the bounding box
    box_width = x_max - x_min
    box_height = y_max - y_min
    
    # Calculate padding based on the percentage
    x_padding = box_width * (padding_percentage / 100)
    y_padding = box_height * (padding_percentage / 100)
    
    # Apply padding and ensure the box stays within image bounds
    x_min = max(0, x_min - x_padding)
    y_min = max(0, y_min - y_padding)
    x_max = min(image_width, x_max + x_padding)
    y_max = min(image_height, y_max + y_padding)
    
    return [x_min, y_min, x_max, y_max]

## Plot padding

In [19]:
def plot_detections(image_path, original_bbox, new_bbox):
    """
    Plots an image with the original and new (padded) bounding boxes.
    
    Args:
        image_path (str): Full path to the image.
        original_bbox (list): List with the original bounding box [x_min, y_min, x_max, y_max].
        new_bbox (list): List with the new bounding box (e.g., padded) [x_min, y_min, x_max, y_max].
    """
    # Load image
    image = Image.open(image_path).convert("RGB")
    image_np = np.array(image)  # Convert to numpy array for plotting

    # Create plot
    fig, ax = plt.subplots(1)
    ax.imshow(image_np)

    # Plot original bounding box in green
    x_min, y_min, x_max, y_max = original_bbox
    width, height = x_max - x_min, y_max - y_min
    rect = patches.Rectangle((x_min, y_min), width, height, linewidth=2, edgecolor='green', facecolor='none')
    ax.add_patch(rect)

    # Plot new bounding box in red
    x_min, y_min, x_max, y_max = new_bbox
    width, height = x_max - x_min, y_max - y_min
    rect = patches.Rectangle((x_min, y_min), width, height, linewidth=2, edgecolor='red', facecolor='none')
    ax.add_patch(rect)

    plt.axis("off")
    plt.show()

## 8.2 - Aplicar padding nas detecções originais
- No final vamos ter um objeto que armazena:
  - ID usada
  - Bounding box de corte

In [None]:
detecoes_transformadas_com_ids = []
padding_percentage = 10

for entrada in valid_images_detections:
    bbox, width, height, image_id = extrair_info(entrada)
    new_bbox = add_percentage_padding_to_box(bbox, padding_percentage, width, height)
    detecoes_transformadas_com_ids.append((image_id, new_bbox))
    

## 8.3 - Salvar em json os resultados finais

In [None]:
with open("deteccoes_finais.json", "w") as f:
    json.dump(detecoes_transformadas_com_ids, f)