In [1]:
import os 
import supervision as sv
from transformers import DetrForObjectDetection, DetrImageProcessor
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import DataLoader
from PIL import Image
import time
import torchvision
from torchvision.ops import box_iou
import torch
import pytorch_lightning
import cv2
import random
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
image_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-101")

In [None]:
dataset = r'D:\Data Science\DocScan-Research\Inference\DETR FINAL DATA'
ANNOTATION_FILE_NAME = r"result.json"
TRAIN_DIRECTORY = os.path.join(dataset, r"train")
VAL_DIRECTORY = os.path.join(dataset, r"val")
TEST_DIRECTORY = os.path.join(dataset, r"test")

class CocoDetection(torchvision.datasets.CocoDetection):
    def __init__(
        self,
        image_directory_path: str,
        image_processor,
        train: bool = True
    ):
        annotation_file_path = os.path.join(image_directory_path, ANNOTATION_FILE_NAME)
        super(CocoDetection, self).__init__(image_directory_path, annotation_file_path)
        self.image_processor = image_processor

    def __getitem__(self, idx):
        images, annotations = super(CocoDetection, self).__getitem__(idx)
        image_id = self.ids[idx]
        annotations = {'image_id': image_id, 'annotations': annotations}
        encoding = self.image_processor(images=images, annotations=annotations, return_tensors="pt")
        pixel_values = encoding["pixel_values"].squeeze()
        target = encoding["labels"][0]

        return pixel_values, target
    
TRAIN_DATASET = CocoDetection(
    image_directory_path=TRAIN_DIRECTORY,
    image_processor=image_processor,
    train=True)
VAL_DATASET = CocoDetection(
    image_directory_path=VAL_DIRECTORY,
    image_processor=image_processor,
    train=False)
TEST_DATASET = CocoDetection(
    image_directory_path=TEST_DIRECTORY,
    image_processor=image_processor,
    train=False)

print("Number of training examples:", len(TRAIN_DATASET))
print("Number of validation examples:", len(VAL_DATASET))
print("Number of test examples:", len(TEST_DATASET))

In [3]:
CHECKPOINT = "facebook/detr-resnet-50"

# Best Performing Model
MODEL_PATH = "D:\Data Science\DocScan-Research\Inference\DETR 11"


## Load Model
def loadModel(MODEL_PATH, CHECKPOINT):
    model = DetrForObjectDetection.from_pretrained(MODEL_PATH)
    image_processor = DetrImageProcessor.from_pretrained(CHECKPOINT)
    return model, image_processor

In [4]:
from transformers import DetrForObjectDetection
import torch
from collections import OrderedDict

# Initialize the model architecture
model, image_processor = loadModel(MODEL_PATH=MODEL_PATH, CHECKPOINT=CHECKPOINT)

In [5]:
checkpoint = torch.load("D:\Data Science\DocScan-Research\Inference\DETR 11\detr-epoch=99-val_loss=0.90.ckpt", map_location='cpu')

# # Get the state dict
state_dict = checkpoint['state_dict']

# # # Remove the 'model.model.' prefix from the state dict keys
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    name = k.replace("model.model.", "")
    new_state_dict[name] = v

# # # Load the modified state dict
model.load_state_dict(new_state_dict, strict=False)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print("Model loaded successfully!")

Model loaded successfully!


In [None]:
import json
from collections import defaultdict
 
def create_ground_truth_dict(json_file_path):
    # Read the JSON file
    with open(json_file_path, 'r') as f:
        data = json.load(f)
   
    # Create a mapping of image_id to file_name
    image_id_to_filename = {img['id']: img['file_name'] for img in data['images']}
   
    # Use defaultdict to automatically initialize empty dictionaries for new keys
    ground_truth = defaultdict(lambda: {'boxes': [], 'labels': []})
   
    # Process annotations
    for annotation in data['annotations']:
        image_id = annotation['image_id']
        filename = image_id_to_filename[image_id]
       
        # Extract bounding box coordinates
        x, y, width, height = annotation['bbox']
        box = [x, y, x + width, y + height]
       
        # Add box and label to the ground_truth dictionary
        ground_truth[filename]['boxes'].append(box)
        ground_truth[filename]['labels'].append(annotation['category_id'])
   
    # Convert defaultdict back to regular dict for final output
    return dict(ground_truth)


In [None]:
json_file_path = r'D:\Data Science\DocScan-Research\Inference\DETR FINAL DATA\test\result.json'
ground_truth = create_ground_truth_dict(json_file_path)

In [None]:
ground_truth

In [None]:
categories = TEST_DATASET.coco.cats
id2label = {k: v['name'] for k,v in categories.items()}
id2label

In [None]:
print(id2label[0])

In [6]:
def add_missing_label(image, save_path, labels):
    if labels:  # Only add text if there are missing labels
        draw = ImageDraw.Draw(image)
        font = ImageFont.load_default()
        text = f"Missing labels: {', '.join(map(str, labels))}"
        position = (10, 10)
        draw.text(position, text, fill="red", font=font)
    os.makedirs(os.path.dirname(save_path), exist_ok=True)  # Ensure the directory exists
    image.save(save_path)

In [7]:
from PIL import Image, ImageDraw, ImageFont
import os
import cv2
import torch

IMAGE_FOLDER = r'D:\Data Science\DocScan-Research\ExtractedImages2'
CONFIDENCE_THRESHOLD = 0.5
IOU_THRESHOLD = 0.5

def inference(image_folder, CONFIDENCE_THRESHOLD, IOU_THRESHOLD):
    results_dict = {}
    
    for img in os.listdir(image_folder):
        IMAGE_PATH = os.path.join(image_folder, img)
        print(f"Processing {IMAGE_PATH}")

        image = cv2.imread(IMAGE_PATH)
        inputs = image_processor(images=image, return_tensors='pt')

        # Move inputs to the same device as the model
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Get ground truth for this image
        # target = ground_truth.get(img, {'boxes': torch.empty((0, 4)), 'labels': torch.empty((0,), dtype=torch.long)})
        # target = {k: v for k, v in target.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
            
            # Post-process
            target_sizes = torch.tensor([image.shape[:2]]).to(model.device)
            results = image_processor.post_process_object_detection(
                outputs=outputs,
                threshold=CONFIDENCE_THRESHOLD,
                target_sizes=target_sizes
            )[0]
        
        detections = sv.Detections.from_transformers(transformers_results=results)
        id2labels = {0: "bar-scale", 1: "color stamp", 2: "detail label", 3: "north sign"}
        # labels = [f"{id2label[class_id]} {confidence:.2f}" for _, confidence, class_id, _ in detections]
        # labels = id2labels
        print(set(detections.class_id)) 
        
        box_annotator = sv.BoxAnnotator()
        frame = box_annotator.annotate(scene=image, detections=detections)
        
        image = Image.fromarray(frame)
        image_path = f"Temp3/results/annotated_{img}"
        all_labels = {0, 1, 2, 3}
        label = all_labels - set(detections.class_id)
        add_missing_label(image, image_path, label) # type: ignore
        results_dict[IMAGE_PATH.replace('Temp3/', '')] = results
    return results_dict

results = inference(IMAGE_FOLDER, CONFIDENCE_THRESHOLD, IOU_THRESHOLD)


Processing D:\Data Science\DocScan-Research\ExtractedImages2\output_folder1.png
{0, 1, 2, 3}
Processing D:\Data Science\DocScan-Research\ExtractedImages2\output_folder2.png
{0, 1, 2, 3}
Processing D:\Data Science\DocScan-Research\ExtractedImages2\output_folder3.png
{0, 1, 2, 3}
Processing D:\Data Science\DocScan-Research\ExtractedImages2\output_folder4.png
{0, 2, 3}
Processing D:\Data Science\DocScan-Research\ExtractedImages2\output_folder5.png
{0, 1, 2}


In [None]:
results = inference(IMAGE_FOLDER, 0.5, 0.5)

In [None]:
results

In [None]:
type(results)

In [None]:
type(ground_truth)

In [None]:
print(ground_truth)

In [None]:
print(results)

In [None]:
print(len(ground_truth))
print(type(ground_truth))
print(len(results))
print(type(results))


In [None]:

def calculate_iou(box1, box2):
    """
    Calculate the Intersection over Union (IoU) of two bounding boxes.
    
    Parameters:
    - box1: (x1, y1, x2, y2) coordinates of the first bounding box
    - box2: (x1, y1, x2, y2) coordinates of the second bounding box
    
    Returns:
    - iou: Intersection over Union (IoU) value
    """
    
    # Unpack the coordinates of the two boxes
    x1_1, y1_1, x2_1, y2_1 = box1
    x1_2, y1_2, x2_2, y2_2 = box2
    
    # Calculate the (x, y) coordinates of the intersection rectangle
    xi1 = max(x1_1, x1_2)
    yi1 = max(y1_1, y1_2)
    xi2 = min(x2_1, x2_2)
    yi2 = min(y2_1, y2_2)
    
    # Calculate the area of the intersection rectangle
    inter_width = max(0, xi2 - xi1)
    inter_height = max(0, yi2 - yi1)
    inter_area = inter_width * inter_height
    
    # Calculate the area of both bounding boxes
    box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
    box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
    
    # Calculate the union area
    union_area = box1_area + box2_area - inter_area
    
    # Calculate the IoU
    iou = inter_area / union_area if union_area != 0 else 0
    
    return iou

In [None]:
ground_truth.keys()

In [None]:
print(ground_truth)

In [None]:
print(list(ground_truth.keys())[0])

In [None]:
ground_truth['images/a9720cda-drawing_76.png']['labels']

In [None]:
ground_truth['images/a9720cda-drawing_76.png']

In [None]:
print(list(results.keys()))

In [None]:
results

In [None]:
results['D:\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\a9720cda-drawing_76.png']['labels']

In [None]:
results['D:\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\a9720cda-drawing_76.png']

## TRIAL AND ERROR

In [None]:
import copy

ground_copy = copy.deepcopy(ground_truth)
results_copy = copy.deepcopy(results)



In [None]:
print(ground_copy['images/a9720cda-drawing_76.png']['labels'])
print(results_copy['D:\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\a9720cda-drawing_76.png']['labels'])

In [None]:
temp_ground_truth = ground_copy
temp_results = results_copy

In [None]:
print(temp_ground_truth)
print(temp_results)

In [None]:
def final_result(image, ground_truth, results, TP, FP, FN):
    result_dict = []
    for i in range(len(ground_truth[f'images/{image}']['labels'])):
        label = ground_truth[f'images/{image}']['labels'][i]
        match = False
        for j in range(len(results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['labels'])):
            pred_label = results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['labels'][j]
            iou = calculate_iou(ground_truth[f'images/{image}']['boxes'][i], results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['boxes'][j])
            if iou > 0:
                if pred_label == label:
                    result_dict.append({'label': label, 
                                        'iou': iou, 
                                        'result': 'TP',
                                        'ground_truth': ground_truth[f'images/{image}']['boxes'][i],
                                        'prediction': results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['boxes'][j]
                                        })
                    results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['labels'][j] = -1
                    TP += 1
                    match = True
                    break
                else:
                    result_dict.append({'label': label, 
                                        'iou': iou, 
                                        'result': 'FP',
                                        'ground_truth': ground_truth[f'images/{image}']['boxes'][i],
                                        'prediction': results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['boxes'][j]
                                        })
                    FP += 1
                    match = True
            
        if not match:
            result_dict.append({'label': label, 
                                'iou': 0, 
                                'result': 'FN',
                                'ground_truth': ground_truth[f'images/{image}']['boxes'][i],
                                'prediction': []
                                })
            FN += 1
                
    return result_dict, TP, FP, FN



In [None]:
TP, FP, FN = 0, 0, 0
result_dict, TP, FP, FN = final_result('1eecb4f3-drawing_25.png', temp_ground_truth, temp_results, TP, FP, FN)

In [None]:
result_dict

In [None]:
print(TP)
print(FP)
print(FN)

On all the images at once 

In [None]:
def final_result(image, ground_truth, results, TP, FP, FN):
    result_dict = []
    for i in range(len(ground_truth[f'images/{image}']['labels'])):
        label = ground_truth[f'images/{image}']['labels'][i]
        match = False
        for j in range(len(results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['labels'])):
            pred_label = results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['labels'][j]
            iou = calculate_iou(ground_truth[f'images/{image}']['boxes'][i], results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['boxes'][j])
            if iou > 0:
                if pred_label == label:
                    result_dict.append({'label': label, 
                                        'iou': iou, 
                                        'result': 'TP',
                                        'ground_truth': ground_truth[f'images/{image}']['boxes'][i],
                                        'prediction': results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['boxes'][j]
                                        })
                    results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['labels'][j] = -1
                    TP += 1
                    match = True
                    break
                else:
                    result_dict.append({'label': label, 
                                        'iou': iou, 
                                        'result': 'FP',
                                        'ground_truth': ground_truth[f'images/{image}']['boxes'][i],
                                        'prediction': results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['boxes'][j]
                                        })
                    FP += 1
                    match = True
            
        if not match:
            result_dict.append({'label': label, 
                                'iou': 0, 
                                'result': 'FN',
                                'ground_truth': ground_truth[f'images/{image}']['boxes'][i],
                                'prediction': []
                                })
            FN += 1
                
    return result_dict, TP, FP, FN

In [None]:
def evaluate_all_images(ground_truth, results):
    TP, FP, FN = 0, 0, 0
    all_results = []
    
    for image in ground_truth.keys():
        image = image.split('/')[-1]
        result_dict, TP, FP, FN = final_result(image, temp_ground_truth, temp_results, TP, FP, FN)
        all_results.extend(result_dict)
    
    return all_results, TP, FP, FN


In [None]:
all_results, TP, FP, FN = evaluate_all_images(temp_ground_truth, temp_results)

print(f'Total True Positives (TP): {TP}')
print(f'Total False Positives (FP): {FP}')
print(f'Total False Negatives (FN): {FN}')
print('Detailed results for each box:')
for result in all_results:
    print(result)

LABEL WISE TP ,FP ,FN

In [None]:
from collections import defaultdict

# def calculate_iou(box1, box2):
#     x1, y1, x2, y2 = box1
#     x1_p, y1_p, x2_p, y2_p = box2

#     xi1, yi1 = max(x1, x1_p), max(y1, y1_p)
#     xi2, yi2 = min(x2, x2_p), min(y2, y2_p)
#     inter_area = max(0, xi2 - xi1 + 1) * max(0, yi2 - yi1 + 1)

#     box1_area = (x2 - x1 + 1) * (y2 - y1 + 1)
#     box2_area = (x2_p - x1_p + 1) * (y2_p - y1_p + 1)
#     union_area = box1_area + box2_area - inter_area

#     iou = inter_area / union_area
#     return iou

def final_result(image, ground_truth, results, TP, FP, FN, class_TP, class_FP, class_FN):
    result_dict = []
    for i in range(len(ground_truth[f'images/{image}']['labels'])):
        label = ground_truth[f'images/{image}']['labels'][i]
        match = False
        for j in range(len(results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['labels'])):
            pred_label = results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['labels'][j]
            iou = calculate_iou(ground_truth[f'images/{image}']['boxes'][i], results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['boxes'][j])
            if iou > 0:
                if pred_label == label:
                    result_dict.append({'label': label, 
                                        'iou': iou, 
                                        'result': 'TP',
                                        'ground_truth': ground_truth[f'images/{image}']['boxes'][i],
                                        'prediction': results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['boxes'][j]
                                        })
                    results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['labels'][j] = -1
                    TP += 1
                    class_TP[label] += 1
                    match = True
                    break
                else:
                    result_dict.append({'label': label, 
                                        'iou': iou, 
                                        'result': 'FP',
                                        'ground_truth': ground_truth[f'images/{image}']['boxes'][i],
                                        'prediction': results[f'D:\\Data Science\\DocScan-Research\\Inference\\DETR FINAL DATA\\test\\images\\{image}']['boxes'][j]
                                        })
                    FP += 1
                    class_FP[label] += 1
                    match = True
            
        if not match:
            result_dict.append({'label': label, 
                                'iou': 0, 
                                'result': 'FN',
                                'ground_truth': ground_truth[f'images/{image}']['boxes'][i],
                                'prediction': []
                                })
            FN += 1
            class_FN[label] += 1
                
    return result_dict, TP, FP, FN, class_TP, class_FP, class_FN

def evaluate_all_images(ground_truth, results):
    TP, FP, FN = 0, 0, 0
    class_TP, class_FP, class_FN = defaultdict(int), defaultdict(int), defaultdict(int)
    all_results = []
    
    for image in ground_truth.keys():
        image = image.split('/')[-1]
        result_dict, TP, FP, FN, class_TP, class_FP, class_FN = final_result(image, ground_truth, results, TP, FP, FN, class_TP, class_FP, class_FN)
        all_results.extend(result_dict)
    
    return all_results, TP, FP, FN, class_TP, class_FP, class_FN

In [None]:
all_results, TP, FP, FN, class_TP, class_FP, class_FN = evaluate_all_images(ground_truth, results)

print(f'Total TP: {TP}, Total FP: {FP}, Total FN: {FN}')
print(f'Class-wise TP: {dict(class_TP)}, Class-wise FP: {dict(class_FP)}, Class-wise FN: {dict(class_FN)}')
