In [1]:
import os 
import supervision as sv
from transformers import DetrForObjectDetection, DetrImageProcessor
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import DataLoader
from PIL import Image
import time
import torchvision
from torchvision.ops import box_iou
import torch
import pytorch_lightning
import cv2
import random
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
image_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-101")

In [3]:
dataset = r'D:\Data Science\DocScan-Research\Inference\DETR DATA'
ANNOTATION_FILE_NAME = r"result.json"
TRAIN_DIRECTORY = os.path.join(dataset, r"train")
VAL_DIRECTORY = os.path.join(dataset, r"val")
TEST_DIRECTORY = os.path.join(dataset, r"test")

class CocoDetection(torchvision.datasets.CocoDetection):
    def __init__(
        self,
        image_directory_path: str,
        image_processor,
        train: bool = True
    ):
        annotation_file_path = os.path.join(image_directory_path, ANNOTATION_FILE_NAME)
        super(CocoDetection, self).__init__(image_directory_path, annotation_file_path)
        self.image_processor = image_processor

    def __getitem__(self, idx):
        images, annotations = super(CocoDetection, self).__getitem__(idx)
        image_id = self.ids[idx]
        annotations = {'image_id': image_id, 'annotations': annotations}
        encoding = self.image_processor(images=images, annotations=annotations, return_tensors="pt")
        pixel_values = encoding["pixel_values"].squeeze()
        target = encoding["labels"][0]

        return pixel_values, target
    
TRAIN_DATASET = CocoDetection(
    image_directory_path=TRAIN_DIRECTORY,
    image_processor=image_processor,
    train=True)
VAL_DATASET = CocoDetection(
    image_directory_path=VAL_DIRECTORY,
    image_processor=image_processor,
    train=False)
TEST_DATASET = CocoDetection(
    image_directory_path=TEST_DIRECTORY,
    image_processor=image_processor,
    train=False)

print("Number of training examples:", len(TRAIN_DATASET))
print("Number of validation examples:", len(VAL_DATASET))
print("Number of test examples:", len(TEST_DATASET))

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Number of training examples: 145
Number of validation examples: 32
Number of test examples: 19


In [4]:
CHECKPOINT = "facebook/detr-resnet-50"

# Best Performing Model
MODEL_PATH = "D:\Data Science\DocScan-Research\Inference\DETR 9"

# Doesnt Work
# MODEL_101 = 'facebook/detr-resnet-101'
# CHECKPOINT_101 = 'facebook/detr-resnet-101'

# Older Model
# MODEL_PATH = "models/DETR-run4"


## Load Model
def loadModel(MODEL_PATH, CHECKPOINT):
    model = DetrForObjectDetection.from_pretrained(MODEL_PATH)
    image_processor = DetrImageProcessor.from_pretrained(CHECKPOINT)
    return model, image_processor

In [5]:
from transformers import DetrForObjectDetection
import torch
from collections import OrderedDict

# Initialize the model architecture
model, image_processor = loadModel(MODEL_PATH=MODEL_PATH, CHECKPOINT=CHECKPOINT)

In [6]:
checkpoint = torch.load("D:\Data Science\DocScan-Research\Inference\DETR 9\detr-epoch=49-val_loss=1.01.ckpt", map_location='cpu')

# # Get the state dict
state_dict = checkpoint['state_dict']

# # # Remove the 'model.model.' prefix from the state dict keys
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    name = k.replace("model.model.", "")
    new_state_dict[name] = v

# # # Load the modified state dict
model.load_state_dict(new_state_dict, strict=False)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print("Model loaded successfully!")

Model loaded successfully!


In [7]:
import json
from collections import defaultdict
 
def create_ground_truth_dict(json_file_path):
    # Read the JSON file
    with open(json_file_path, 'r') as f:
        data = json.load(f)
   
    # Create a mapping of image_id to file_name
    image_id_to_filename = {img['id']: img['file_name'] for img in data['images']}
   
    # Use defaultdict to automatically initialize empty dictionaries for new keys
    ground_truth = defaultdict(lambda: {'boxes': [], 'labels': []})
   
    # Process annotations
    for annotation in data['annotations']:
        image_id = annotation['image_id']
        filename = image_id_to_filename[image_id]
       
        # Extract bounding box coordinates
        x, y, width, height = annotation['bbox']
        box = [x, y, x + width, y + height]
       
        # Add box and label to the ground_truth dictionary
        ground_truth[filename]['boxes'].append(box)
        ground_truth[filename]['labels'].append(annotation['category_id'])
   
    # Convert defaultdict back to regular dict for final output
    return dict(ground_truth)


In [8]:
json_file_path = r'D:\Data Science\DocScan-Research\Inference\DETR DATA\test\result.json'
ground_truth = create_ground_truth_dict(json_file_path)

In [9]:
ground_truth

{'images/3c56fa90-drawing_7.png': {'boxes': [[109.7142857142857,
    133.71428571428572,
    154.28571428571428,
    176.57142857142856],
   [536.5714285714284,
    363.42857142857144,
    632.5714285714284,
    407.99999999999994],
   [214.28571428571425,
    517.7142857142858,
    289.71428571428567,
    548.5714285714287]],
  'labels': [3, 1, 2]},
 'images/2e4aefb0-drawing_15.png': {'boxes': [[404.57142857142856,
    65.14285714285714,
    452.57142857142856,
    104.57142857142857],
   [608.5714285714284,
    366.85714285714283,
    702.8571428571428,
    407.9999999999999],
   [44.57142857142857,
    533.1428571428572,
    130.28571428571428,
    557.1428571428572],
   [217.7142857142857,
    143.99999999999997,
    293.1428571428571,
    162.85714285714283],
   [385.71428571428567,
    471.4285714285715,
    442.2857142857143,
    500.57142857142856],
   [123.42857142857139,
    452.5714285714286,
    212.5714285714285,
    478.28571428571433]],
  'labels': [3, 1, 0, 2, 2, 2]},
 

In [10]:
categories = TEST_DATASET.coco.cats
id2label = {k: v['name'] for k,v in categories.items()}
id2label

{0: 'bar-scale', 1: 'color-stamp', 2: 'detail-labels', 3: 'north-sign'}

In [11]:
print(id2label[0])

bar-scale


In [12]:
def add_missing_label(image, save_path, labels):
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    text = f"Missing labels: {', '.join(map(str, labels))}"
    position = (10, 10)
    draw.text(position, text, fill="red", font=font)
    os.makedirs(os.path.dirname(save_path), exist_ok=True)  # Ensure the directory exists
    image.save(save_path)

In [13]:
from PIL import Image, ImageDraw, ImageFont
import os
import cv2
import torch

IMAGE_FOLDER = r'D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images'
CONFIDENCE_THRESHOLD = 0.6
IOU_THRESHOLD = 0.7

def inference(image_folder, CONFIDENCE_THRESHOLD, IOU_THRESHOLD):
    results_dict = {}
    
    for img in os.listdir(image_folder):
        IMAGE_PATH = os.path.join(image_folder, img)
        print(f"Processing {IMAGE_PATH}")

        image = cv2.imread(IMAGE_PATH)
        inputs = image_processor(images=image, return_tensors='pt')

        # Move inputs to the same device as the model
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Get ground truth for this image
        target = ground_truth.get(img, {'boxes': torch.empty((0, 4)), 'labels': torch.empty((0,), dtype=torch.long)})
        target = {k: v for k, v in target.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
            
            # Post-process
            target_sizes = torch.tensor([image.shape[:2]]).to(model.device)
            results = image_processor.post_process_object_detection(
                outputs=outputs,
                threshold=CONFIDENCE_THRESHOLD,
                target_sizes=target_sizes
            )[0]
        
        detections = sv.Detections.from_transformers(transformers_results=results)
        labels = [f"{id2label[class_id]} {confidence:.2f}" for _, confidence, class_id, _ in detections]
        print(set(detections.class_id)) 
        
        box_annotator = sv.BoxAnnotator()
        frame = box_annotator.annotate(scene=image, detections=detections, labels=labels)
        
        image = Image.fromarray(frame)
        image_path = f"Temp/results/annotated_{img}"
        all_labels = {0, 1, 2, 3}
        label = all_labels - set(detections.class_id)
        add_missing_label(image, image_path, label) # type: ignore
        results_dict[IMAGE_PATH.replace('Temp/', '')] = results
    return results_dict

results = inference(IMAGE_FOLDER, CONFIDENCE_THRESHOLD, IOU_THRESHOLD)

Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\0f7db672-drawing_88.png
{2, 3}
Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\1eecb4f3-drawing_25.png
{0, 1, 2, 3}
Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\2832a63f-drawing_44.png
{0, 1, 2, 3}
Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\2e4aefb0-drawing_15.png
{0, 1, 2}
Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\3c56fa90-drawing_7.png
{1, 2, 3}
Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\594b3b57-drawing_113.png
{0, 2}
Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\5b5d1b9b-drawing_123.png
{2, 3}
Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\72719062-drawing_43.png
{0, 1, 2}
Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\777ab86f-drawing_22.png
{1, 2, 3}
Processing D

In [14]:
results = inference(IMAGE_FOLDER, 0.5, 0.6)

Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\0f7db672-drawing_88.png
{2, 3}
Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\1eecb4f3-drawing_25.png
{0, 1, 2, 3}
Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\2832a63f-drawing_44.png
{0, 1, 2, 3}
Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\2e4aefb0-drawing_15.png
{0, 1, 2, 3}
Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\3c56fa90-drawing_7.png
{1, 2, 3}
Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\594b3b57-drawing_113.png
{0, 2, 3}
Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\5b5d1b9b-drawing_123.png
{2, 3}
Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\72719062-drawing_43.png
{0, 1, 2, 3}
Processing D:\Data Science\DocScan-Research\Inference\DETR DATA\test\images\777ab86f-drawing_22.png
{1, 2, 3}
Pro

In [15]:
results

{'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\0f7db672-drawing_88.png': {'scores': tensor([0.9013, 0.8609, 0.6256, 0.6988]),
  'labels': tensor([2, 3, 3, 2]),
  'boxes': tensor([[624.6639, 264.1835, 713.7153, 295.3075],
          [ 45.2162,  70.5122,  86.6006, 107.0394],
          [ 36.6824, 351.7408,  79.7447, 390.5545],
          [215.7100, 516.6870, 280.1140, 544.6638]])},
 'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\1eecb4f3-drawing_25.png': {'scores': tensor([0.8693, 0.8655, 0.8046, 0.8504, 0.8804, 0.9235, 0.8288, 0.9090, 0.8485]),
  'labels': tensor([2, 3, 1, 0, 2, 2, 2, 2, 2]),
  'boxes': tensor([[668.8936, 238.1275, 722.2538, 258.6493],
          [ 39.1284,  66.4541,  83.1953, 102.4941],
          [441.2927, 526.8177, 537.1543, 561.7869],
          [ 38.6701, 524.7820, 154.8171, 548.7828],
          [109.5296, 294.8550, 181.0603, 314.5777],
          [277.1948, 278.4435, 318.4635, 293.4428],
          [ 63.5850, 379.7266,

In [16]:
type(results)

dict

In [17]:
type(ground_truth)

dict

In [18]:
print(ground_truth)

{'images/3c56fa90-drawing_7.png': {'boxes': [[109.7142857142857, 133.71428571428572, 154.28571428571428, 176.57142857142856], [536.5714285714284, 363.42857142857144, 632.5714285714284, 407.99999999999994], [214.28571428571425, 517.7142857142858, 289.71428571428567, 548.5714285714287]], 'labels': [3, 1, 2]}, 'images/2e4aefb0-drawing_15.png': {'boxes': [[404.57142857142856, 65.14285714285714, 452.57142857142856, 104.57142857142857], [608.5714285714284, 366.85714285714283, 702.8571428571428, 407.9999999999999], [44.57142857142857, 533.1428571428572, 130.28571428571428, 557.1428571428572], [217.7142857142857, 143.99999999999997, 293.1428571428571, 162.85714285714283], [385.71428571428567, 471.4285714285715, 442.2857142857143, 500.57142857142856], [123.42857142857139, 452.5714285714286, 212.5714285714285, 478.28571428571433]], 'labels': [3, 1, 0, 2, 2, 2]}, 'images/777ab86f-drawing_22.png': {'boxes': [[666.8571428571428, 77.14285714285714, 711.4285714285714, 118.28571428571429], [545.142857

In [19]:
print(results)

{'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\0f7db672-drawing_88.png': {'scores': tensor([0.9013, 0.8609, 0.6256, 0.6988]), 'labels': tensor([2, 3, 3, 2]), 'boxes': tensor([[624.6639, 264.1835, 713.7153, 295.3075],
        [ 45.2162,  70.5122,  86.6006, 107.0394],
        [ 36.6824, 351.7408,  79.7447, 390.5545],
        [215.7100, 516.6870, 280.1140, 544.6638]])}, 'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\1eecb4f3-drawing_25.png': {'scores': tensor([0.8693, 0.8655, 0.8046, 0.8504, 0.8804, 0.9235, 0.8288, 0.9090, 0.8485]), 'labels': tensor([2, 3, 1, 0, 2, 2, 2, 2, 2]), 'boxes': tensor([[668.8936, 238.1275, 722.2538, 258.6493],
        [ 39.1284,  66.4541,  83.1953, 102.4941],
        [441.2927, 526.8177, 537.1543, 561.7869],
        [ 38.6701, 524.7820, 154.8171, 548.7828],
        [109.5296, 294.8550, 181.0603, 314.5777],
        [277.1948, 278.4435, 318.4635, 293.4428],
        [ 63.5850, 379.7266, 117.5556, 397.8738],
     

In [20]:
print(len(ground_truth))
print(type(ground_truth))
print(len(results))
print(type(results))


19
<class 'dict'>
19
<class 'dict'>


In [21]:

def calculate_iou(box1, box2):
    """
    Calculate the Intersection over Union (IoU) of two bounding boxes.
    
    Parameters:
    - box1: (x1, y1, x2, y2) coordinates of the first bounding box
    - box2: (x1, y1, x2, y2) coordinates of the second bounding box
    
    Returns:
    - iou: Intersection over Union (IoU) value
    """
    
    # Unpack the coordinates of the two boxes
    x1_1, y1_1, x2_1, y2_1 = box1
    x1_2, y1_2, x2_2, y2_2 = box2
    
    # Calculate the (x, y) coordinates of the intersection rectangle
    xi1 = max(x1_1, x1_2)
    yi1 = max(y1_1, y1_2)
    xi2 = min(x2_1, x2_2)
    yi2 = min(y2_1, y2_2)
    
    # Calculate the area of the intersection rectangle
    inter_width = max(0, xi2 - xi1)
    inter_height = max(0, yi2 - yi1)
    inter_area = inter_width * inter_height
    
    # Calculate the area of both bounding boxes
    box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
    box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
    
    # Calculate the union area
    union_area = box1_area + box2_area - inter_area
    
    # Calculate the IoU
    iou = inter_area / union_area if union_area != 0 else 0
    
    return iou

In [22]:
ground_truth.keys()

dict_keys(['images/3c56fa90-drawing_7.png', 'images/2e4aefb0-drawing_15.png', 'images/777ab86f-drawing_22.png', 'images/f9310c0d-drawing_24.png', 'images/1eecb4f3-drawing_25.png', 'images/817c3fb2-drawing_42.png', 'images/72719062-drawing_43.png', 'images/2832a63f-drawing_44.png', 'images/78e13516-drawing_45.png', 'images/abc5c537-drawing_48.png', 'images/8e605e02-drawing_56.png', 'images/a021edac-drawing_64.png', 'images/f15ca727-drawing_67.png', 'images/a9720cda-drawing_76.png', 'images/0f7db672-drawing_88.png', 'images/594b3b57-drawing_113.png', 'images/5b5d1b9b-drawing_123.png', 'images/f12144f6-drawing_168.png', 'images/81c64eea-drawing_106.png'])

In [23]:
print(list(ground_truth.keys())[0])

images/3c56fa90-drawing_7.png


In [24]:
ground_truth['images/2e4aefb0-drawing_15.png']['labels']

[3, 1, 0, 2, 2, 2]

In [25]:
ground_truth['images/2e4aefb0-drawing_15.png']

{'boxes': [[404.57142857142856,
   65.14285714285714,
   452.57142857142856,
   104.57142857142857],
  [608.5714285714284,
   366.85714285714283,
   702.8571428571428,
   407.9999999999999],
  [44.57142857142857,
   533.1428571428572,
   130.28571428571428,
   557.1428571428572],
  [217.7142857142857,
   143.99999999999997,
   293.1428571428571,
   162.85714285714283],
  [385.71428571428567,
   471.4285714285715,
   442.2857142857143,
   500.57142857142856],
  [123.42857142857139,
   452.5714285714286,
   212.5714285714285,
   478.28571428571433]],
 'labels': [3, 1, 0, 2, 2, 2]}

In [26]:
print(list(results.keys()))

['D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\0f7db672-drawing_88.png', 'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\1eecb4f3-drawing_25.png', 'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\2832a63f-drawing_44.png', 'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\2e4aefb0-drawing_15.png', 'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\3c56fa90-drawing_7.png', 'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\594b3b57-drawing_113.png', 'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\5b5d1b9b-drawing_123.png', 'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\72719062-drawing_43.png', 'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\777ab86f-drawing_22.png', 'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\78e13516-drawing_45.png', 'D:\\Dat

In [27]:
results['D:\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\2e4aefb0-drawing_15.png']['labels']

tensor([1, 2, 0, 2, 3])

In [28]:
results['D:\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\2e4aefb0-drawing_15.png']

{'scores': tensor([0.7578, 0.7583, 0.7988, 0.9328, 0.5799]),
 'labels': tensor([1, 2, 0, 2, 3]),
 'boxes': tensor([[608.8925, 369.3228, 698.7352, 404.4241],
         [383.8599, 471.5917, 445.0086, 499.6260],
         [ 40.2911, 527.2289, 125.3356, 554.7177],
         [215.5658, 142.9829, 295.6338, 167.3094],
         [413.2217,  69.0897, 453.0287, 103.7057]])}

In [None]:
# temp_ground_truth = ground_truth
# temp_results = results

In [None]:
# print(temp_ground_truth)

## this is original working code


1. commenting to avoid confusion

In [None]:

# TP, FP, FN = 0, 0, 0

# def final_result(image, ground_truth, results):
#     result_dict = []
#     for i in range(len(ground_truth[f'images/{image}']['labels'])):
#         label = ground_truth[f'images/{image}']['labels'][i]
#         # label = 2 
#         match = False
#         for j in range(len(results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\{image}']['labels'])):
#             pred_label = results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\{image}']['labels'][j]
#             if pred_label == label:
#                 # pred_label = 2
#                 # gt = x1, x2
#                 # pred = y1, y2
#                 iou = calculate_iou(ground_truth[f'images/{image}']['boxes'][i], results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\{image}']['boxes'][j])
#                 if iou > 0.6:
#                     result_dict.append({'label': label, 
#                                         'iou': iou, 
#                                         'result': 'TP',
#                                         'ground_truth': ground_truth[f'images/{image}']['boxes'][i],
#                                         'prediction': results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\{image}']['boxes'][j]
#                                         })
#                     match = True
#                     results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\{image}']['labels'][j] = -1
                    
#                     continue
#                 elif iou < 0.6 or iou == 0:
#                     result_dict.append({'label': label, 
#                                         'iou': iou, 
#                                         'result': 'FP',
#                                         'ground_truth': ground_truth[f'images/{image}']['boxes'][i],
#                                         'prediction': results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\{image}']['boxes'][j]
#                                         })
#                     # results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\{image}']['labels'][j] = -1
#                     # match = True
#         if match == False:
#             result_dict.append({'label': label, 
#                                 'iou': 0, 
#                                 'result': 'FN',
#                                 'ground_truth': ground_truth[f'images/{image}']['boxes'][i],
#                                 'prediction': []
#                                 })
#             # results[f'images\\{image}']['labels'][j] = -1
                
#     return result_dict 
                    

In [None]:
# result_dict = final_result('2e4aefb0-drawing_15.png', temp_ground_truth, temp_results)

In [None]:
# result_dict

## TRIAL AND ERROR

In [29]:
import copy

ground_copy = copy.deepcopy(ground_truth)
results_copy = copy.deepcopy(results)



In [30]:
print(ground_copy['images/2e4aefb0-drawing_15.png']['labels'])
print(results_copy['D:\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\2e4aefb0-drawing_15.png']['labels'])

[3, 1, 0, 2, 2, 2]
tensor([1, 2, 0, 2, 3])


In [31]:
temp_ground_truth = ground_copy
temp_results = results_copy

In [32]:
print(temp_ground_truth)
print(temp_results)

{'images/3c56fa90-drawing_7.png': {'boxes': [[109.7142857142857, 133.71428571428572, 154.28571428571428, 176.57142857142856], [536.5714285714284, 363.42857142857144, 632.5714285714284, 407.99999999999994], [214.28571428571425, 517.7142857142858, 289.71428571428567, 548.5714285714287]], 'labels': [3, 1, 2]}, 'images/2e4aefb0-drawing_15.png': {'boxes': [[404.57142857142856, 65.14285714285714, 452.57142857142856, 104.57142857142857], [608.5714285714284, 366.85714285714283, 702.8571428571428, 407.9999999999999], [44.57142857142857, 533.1428571428572, 130.28571428571428, 557.1428571428572], [217.7142857142857, 143.99999999999997, 293.1428571428571, 162.85714285714283], [385.71428571428567, 471.4285714285715, 442.2857142857143, 500.57142857142856], [123.42857142857139, 452.5714285714286, 212.5714285714285, 478.28571428571433]], 'labels': [3, 1, 0, 2, 2, 2]}, 'images/777ab86f-drawing_22.png': {'boxes': [[666.8571428571428, 77.14285714285714, 711.4285714285714, 118.28571428571429], [545.142857

In [33]:
def final_result(image, ground_truth, results, TP, FP, FN):
    result_dict = []
    for i in range(len(ground_truth[f'images/{image}']['labels'])):
        label = ground_truth[f'images/{image}']['labels'][i]
        match = False
        for j in range(len(results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\{image}']['labels'])):
            pred_label = results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\{image}']['labels'][j]
            iou = calculate_iou(ground_truth[f'images/{image}']['boxes'][i], results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\{image}']['boxes'][j])
            if iou > 0:
                if pred_label == label:
                    result_dict.append({'label': label, 
                                        'iou': iou, 
                                        'result': 'TP',
                                        'ground_truth': ground_truth[f'images/{image}']['boxes'][i],
                                        'prediction': results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\{image}']['boxes'][j]
                                        })
                    results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\{image}']['labels'][j] = -1
                    TP += 1
                    match = True
                    break
                else:
                    result_dict.append({'label': label, 
                                        'iou': iou, 
                                        'result': 'FP',
                                        'ground_truth': ground_truth[f'images/{image}']['boxes'][i],
                                        'prediction': results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR DATA\\test\\images\\{image}']['boxes'][j]
                                        })
                    FP += 1
                    match = True
            
        if not match:
            result_dict.append({'label': label, 
                                'iou': 0, 
                                'result': 'FN',
                                'ground_truth': ground_truth[f'images/{image}']['boxes'][i],
                                'prediction': []
                                })
            FN += 1
                
    return result_dict, TP, FP, FN



In [34]:
TP, FP, FN = 0, 0, 0
result_dict, TP, FP, FN = final_result('2e4aefb0-drawing_15.png', temp_ground_truth, temp_results, TP, FP, FN)

In [35]:
result_dict

[{'label': 3,
  'iou': tensor(0.7138),
  'result': 'TP',
  'ground_truth': [404.57142857142856,
   65.14285714285714,
   452.57142857142856,
   104.57142857142857],
  'prediction': tensor([413.2217,  69.0897, 453.0287, 103.7057])},
 {'label': 1,
  'iou': tensor(0.8130),
  'result': 'TP',
  'ground_truth': [608.5714285714284,
   366.85714285714283,
   702.8571428571428,
   407.9999999999999],
  'prediction': tensor([608.8925, 369.3228, 698.7352, 404.4241])},
 {'label': 0,
  'iou': tensor(0.6569),
  'result': 'TP',
  'ground_truth': [44.57142857142857,
   533.1428571428572,
   130.28571428571428,
   557.1428571428572],
  'prediction': tensor([ 40.2911, 527.2289, 125.3356, 554.7177])},
 {'label': 2,
  'iou': tensor(0.7303),
  'result': 'TP',
  'ground_truth': [217.7142857142857,
   143.99999999999997,
   293.1428571428571,
   162.85714285714283],
  'prediction': tensor([215.5658, 142.9829, 295.6338, 167.3094])},
 {'label': 2,
  'iou': tensor(0.8925),
  'result': 'TP',
  'ground_truth': [3

In [36]:
print(TP)
print(FP)
print(FN)

5
0
1
