# Object detection

## Load the dataset

In [14]:
import os 
from picsellia import Client

import torch
from PIL import Image

from datasets import load_dataset
from picsellia.types.enums import AnnotationFileType, InferenceType
from transformers import AutoModelForObjectDetection, TrainingArguments, AutoImageProcessor
from transformers import Trainer
from transformers import pipeline, TrainerCallback

from utils import read_annotation_file, format_coco_annot_to_jsonlines_format, write_metadata_file, custom_train_test_split, transform_aug_ann, collate_fn, save_annotation_file_images, format_evaluation_results, run_evaluation, CocoDetection, get_dataset_image_ids, get_filenames_by_ids, evaluate_asset

In [2]:
# Read the Picsellia credentials from the environment so that no secret is
# stored in the notebook. Export PICSELLIA_API_TOKEN before starting the kernel.
# NOTE(review): the token that used to be hardcoded here has been committed and
# should be revoked.
api_token = os.environ.get("PICSELLIA_API_TOKEN")
if not api_token:
    raise RuntimeError(
        "Set the PICSELLIA_API_TOKEN environment variable before running this notebook."
    )

# The organization can be overridden through the environment; the default keeps
# the value this notebook has always used.
organization_name = os.environ.get("PICSELLIA_ORGANIZATION_NAME", "hajer")

client = Client(api_token=api_token, organization_name=organization_name)
experiment = client.get_experiment_by_id('018a1d77-3e0e-77d7-a6c3-7f10942da0bd')

Hi [94mhajer[0m, welcome back. 🥑
Workspace: [93myour[0m organization.


In [3]:
# Folder, under the experiment's base directory, that receives the dataset files.
data_dir = os.path.join(experiment.base_dir, "data")

# Work with the first dataset version attached to the experiment.
dataset_list = experiment.list_attached_dataset_versions()
dataset = dataset_list[0]
dataset.download(data_dir)

In [4]:
# Read the dataset's annotation file, reformat it with the JSON-lines helper and
# write the result as metadata.jsonl inside data_dir.
metadata_path = os.path.join(data_dir, 'metadata.jsonl')
annotations, annotation_file_path = read_annotation_file(target_path=data_dir, dataset=dataset)
formatted_coco = format_coco_annot_to_jsonlines_format(annotations=annotations)
write_metadata_file(output_path=metadata_path, data=formatted_coco)

In [5]:
# Load the downloaded folder (images plus metadata.jsonl) with the "imagefolder" builder.
loaded_dataset = load_dataset(path="imagefolder", data_dir=data_dir)

Resolving data files:   0%|          | 0/157 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/156 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
# Proportion handed to the split helper as `test_prop`.
test_prop = 0.15
train_test_valid_dataset = custom_train_test_split(
    loaded_dataset=loaded_dataset,
    test_prop=test_prop,
)

In [7]:
# Display the resulting splits (train / test / eval) with their features and row counts.
train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'image_id', 'width', 'height', 'objects'],
        num_rows: 131
    })
    test: Dataset({
        features: ['image', 'image_id', 'width', 'height', 'objects'],
        num_rows: 12
    })
    eval: Dataset({
        features: ['image', 'image_id', 'width', 'height', 'objects'],
        num_rows: 12
    })
})

In [8]:
# Category names in the order they appear in the annotation file.
categories = [category['name'] for category in annotations['categories']]

# Index <-> name lookups handed to the model, indexed from 0.
id2label = dict(enumerate(categories))
label2id = {name: index for index, name in id2label.items()}

# Same mapping with string keys, logged to the experiment as its labelmap.
labelmap = {str(index): name for index, name in id2label.items()}
experiment.log("labelmap", labelmap, "labelmap", replace=True)

[92mLog labelmap[0m (id: 018a1d7c-38ff-7684-850d-379d9499de94)

## Preprocess the data

In [9]:
# Pretrained checkpoint used for both the image processor and the model.
checkpoint = "facebook/detr-resnet-50"
image_processor = AutoImageProcessor.from_pretrained(pretrained_model_name_or_path=checkpoint)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [10]:
# Attach the augmentation/annotation transform to the training split only.
train_split = train_test_valid_dataset["train"]
train_test_valid_dataset["train"] = train_split.with_transform(transform_aug_ann)

In [None]:
# for sample in range(len(train_test_valid_dataset['train'])):
#     print(sample)
#     print(train_test_valid_dataset["train"][sample])

## In case there are images with degenerate boxes, remove them
# remove_idx = [5325]
# keep = [i for i in range(len(train_test_valid_dataset["train"])) if i not in remove_idx]
# train_test_valid_dataset["train"] = train_test_valid_dataset["train"].select(keep)

## Training the DETR model

In [11]:
# Start from the pretrained checkpoint, configured with this dataset's label maps.
# ignore_mismatched_sizes=True is passed because the label set differs from the
# one the checkpoint was trained with.
label_mappings = {"id2label": id2label, "label2id": label2id}
model = AutoModelForObjectDetection.from_pretrained(
    checkpoint,
    ignore_mismatched_sizes=True,
    **label_mappings,
)

Downloading model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DetrForObjectDetection were not initialized from the model checkpoin

In [12]:
# Checkpoints and the final model are written to the experiment's checkpoint directory.
output_model_dir = os.fspath(experiment.checkpoint_dir)

In [None]:
# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir=output_model_dir,
    per_device_train_batch_size=8,
    num_train_epochs=30,
    # Mixed precision needs a CUDA device; fall back to full precision otherwise
    # so the notebook still runs on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    save_steps=200,
    logging_steps=50,
    lr_scheduler_type='constant',
    learning_rate=1e-5,
    save_total_limit=2,
    # Keep every dataset column: the transform attached to the training split
    # reads them when batches are built.
    remove_unused_columns=False,
    push_to_hub=False,
)

In [51]:
class LogObjectDetectionMetricsCallback(TrainerCallback):
    """Forward the metrics logged by the Trainer to the Picsellia experiment.

    Relies on the notebook-level ``experiment`` object being defined.
    """

    # Metrics logged with type 'value'; every other metric is logged as 'line'.
    SUMMARY_METRICS = frozenset({
        'train_loss',
        'total_flos',
        'train_steps_per_second',
        'train_samples_per_second',
        'train_runtime',
    })

    # Only values strictly below this threshold are forwarded.
    # NOTE(review): 'total_flos' is listed above but was 1.5e18 in the recorded
    # run, so it never passes this filter - confirm whether that is intended.
    MAX_LOGGED_VALUE = 1000

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Log each numeric entry of ``logs`` to the experiment.

        Args:
            args, state, control: objects passed by the Trainer; only
                ``state.is_local_process_zero`` is read.
            logs: mapping of metric name to value; may be None or empty, in
                which case nothing is logged.
            **kwargs: ignored.
        """
        # `logs` defaults to None, so guard before iterating.
        if not logs or not state.is_local_process_zero:
            return

        for metric_name, value in logs.items():
            try:
                below_threshold = value < self.MAX_LOGGED_VALUE
            except TypeError:
                # Entries that cannot be compared to a number are skipped
                # instead of aborting training.
                continue
            if not below_threshold:
                continue

            log_type = 'value' if metric_name in self.SUMMARY_METRICS else 'line'
            experiment.log(str(metric_name), float(value), log_type)

In [52]:
# Assemble the Trainer: the image processor is passed through the `tokenizer`
# argument, and the callback class forwards logged metrics to the experiment.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_test_valid_dataset["train"],
    data_collator=collate_fn,
    tokenizer=image_processor,
    callbacks=[LogObjectDetectionMetricsCallback],
)
trainer = Trainer(**trainer_kwargs)
trainer.train()




Step,Training Loss
50,4.5809
100,3.8208
150,3.5168
200,3.4854
250,3.3091
300,3.2076
350,3.1507
400,3.1179


TrainOutput(global_step=420, training_loss=3.4973063241867792, metrics={'train_runtime': 520.0942, 'train_samples_per_second': 6.23, 'train_steps_per_second': 0.808, 'total_flos': 1.5484674023424e+18, 'train_loss': 3.4973063241867792, 'epoch': 30.0})

In [53]:
# Persist the fine-tuned model to the experiment's checkpoint directory.
trainer.save_model(output_model_dir)

## Evaluate

Object detection models are commonly evaluated with a set of <a href="https://cocodataset.org/#detection-eval">COCO-style metrics</a>.
You can use one of the existing metrics implementations, but here you'll use the one from `torchvision` to evaluate the final
model that you saved to the experiment's checkpoint directory.

To use the `torchvision` evaluator, you'll need to prepare a ground truth COCO dataset. The API to build a COCO dataset
requires the data to be stored in a certain format, so you'll need to save images and annotations to disk first. Just like
when you prepared your data for training, the annotations from `train_test_valid_dataset["test"]` need to be formatted. However, images
should stay as they are.

The evaluation step requires a bit of work, but it can be split into three major steps.
First, prepare the `train_test_valid_dataset["test"]` set: format the annotations and save the data to disk.

Next, prepare an instance of the `CocoDetection` class that can be used with the COCO evaluator.

Finally, load the fine-tuned model, run the evaluation, and log the resulting metrics to the experiment.

In [56]:
# Reload the image processor that was saved alongside the fine-tuned model.
im_processor = AutoImageProcessor.from_pretrained(output_model_dir)

# Save the test split's images and annotation file, then wrap them in the
# CocoDetection dataset used for evaluation.
path_output, path_anno = save_annotation_file_images(
    dataset=train_test_valid_dataset["test"],
    experiment=experiment,
    id2label=id2label,
)
test_ds_coco_format = CocoDetection(path_output, im_processor, path_anno)

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!


In [78]:
# Reload the fine-tuned weights from the checkpoint directory for evaluation.
model = AutoModelForObjectDetection.from_pretrained(pretrained_model_name_or_path=output_model_dir)

100%|██████████| 3/3 [00:31<00:00, 10.59s/it]


Accumulating evaluation results...
DONE (t=0.07s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.012
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.036
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.007
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.005
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.021
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.079
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.005
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.019
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.041
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.020
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.051
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= la

In [62]:
# Run the evaluation on the test split, reformat the results with the helper
# and log them to the experiment as a table.
results = run_evaluation(
    model=model,
    im_processor=im_processor,
    test_ds_coco_format=test_ds_coco_format,
)
casted_results = format_evaluation_results(results=results)
experiment.log(name='evaluation metrics', data=casted_results, type='table')

[92mLog evaluation metrics[0m (id: 018a21a3-74f6-7e27-aa46-f2df84f60aa5)

## Inference

In [41]:
# for one image
image_path = "/home/ubuntu/dev/vision-transformers/grape-detector/data/SYH_2017-04-27_1291.jpg"
image = Image.open(image_path)

In [42]:
# Load the fine-tuned processor and model from the checkpoint directory.
image_processor = AutoImageProcessor.from_pretrained(output_model_dir)
model = AutoModelForObjectDetection.from_pretrained(output_model_dir)

# PIL reports (width, height); post-processing is given (height, width).
image_width, image_height = image.size

with torch.no_grad():
    inputs = image_processor(images=image, return_tensors="pt")
    outputs = model(**inputs)

    target_sizes = torch.tensor([(image_height, image_width)])
    detections_per_image = image_processor.post_process_object_detection(
        outputs, threshold=0.5, target_sizes=target_sizes
    )
    # A single image was passed, so take its entry.
    results = detections_per_image[0]

# Print one line per detection kept after the 0.5 score threshold.
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(coordinate, 2) for coordinate in box.tolist()]
    class_name = model.config.id2label[label.item()]
    confidence = round(score.item(), 3)
    print(
        f"Detected {class_name} with confidence "
        f"{confidence} at location {box}"
    )

Detected grape with confidence 0.887 at location [174.96, 798.94, 337.52, 1209.99]
Detected grape with confidence 0.777 at location [407.09, 592.26, 531.31, 750.52]
Detected grape with confidence 0.946 at location [170.18, 798.43, 315.74, 1022.5]
Detected grape with confidence 0.608 at location [1563.34, 825.6, 1672.37, 926.45]
Detected grape with confidence 0.504 at location [892.52, 732.31, 997.44, 945.75]
Detected grape with confidence 0.619 at location [206.25, 389.26, 370.77, 628.61]
Detected grape with confidence 0.949 at location [1373.77, 612.05, 1521.82, 1013.14]
Detected grape with confidence 0.603 at location [1376.86, 597.86, 1513.69, 907.13]
Detected grape with confidence 0.896 at location [1252.14, 508.41, 1406.99, 708.07]
Detected grape with confidence 0.922 at location [403.47, 594.29, 550.72, 964.19]
Detected grape with confidence 0.604 at location [499.73, 709.88, 637.18, 902.48]
Detected grape with confidence 0.775 at location [216.86, 1023.18, 337.33, 1208.56]
Detec

In [72]:
# Index the dataset's label objects by name.
dataset_labels = {dataset_label.name: dataset_label for dataset_label in dataset.list_labels()}

# Fine-tuned processor and model, loaded from the checkpoint directory.
model = AutoModelForObjectDetection.from_pretrained(output_model_dir)
image_processor = AutoImageProcessor.from_pretrained(output_model_dir)

In [65]:
# Collect the image ids of the "eval" split, then look up their filenames in the annotations.
eval_split_name = "eval"
eval_image_ids = get_dataset_image_ids(train_test_valid_dataset, eval_split_name)
id2filename_eval = get_filenames_by_ids(annotations=annotations, image_ids=eval_image_ids)

In [None]:
# Call evaluate_asset once for every filename of the eval split.
for file_path in id2filename_eval.values():
    evaluate_asset(file_path=file_path)

In [76]:
# Ask the experiment to compute evaluation metrics for object detection; the
# call's return value is displayed as the cell output.
experiment.compute_evaluations_metrics(inference_type=InferenceType.OBJECT_DETECTION)

JobV1 018a21ab-e898-7f6c-bb34-dafa9f10f195: JobStatus.RUNNING