In [1]:
import torch, detectron2
!nvcc --version
TORCH_VERSION = ".".join(torch.__version__.split(".")[:2])
CUDA_VERSION = torch.__version__.split("+")[-1]
print("torch: ", TORCH_VERSION, "; cuda: ", CUDA_VERSION)
print("detectron2:", detectron2.__version__)

torch:  2.4 ; cuda:  cpu
detectron2: 0.6


'nvcc' �́A�����R�}���h�܂��͊O���R�}���h�A
����\�ȃv���O�����܂��̓o�b�` �t�@�C���Ƃ��ĔF������Ă��܂���B


In [2]:
# Some basic setup:
# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import numpy as np
import os, json, cv2, random

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer

##
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.structures import BoxMode

In [None]:
def get_hands_dataset_function(directory, class_labels):
    dataset_dicts = []
    json_files = [f for f in os.listdir(directory) if f.endswith(".json")] # JSON FILES
    for idx, json_file in enumerate(json_files):
        json_path = os.path.join(directory, json_file)
        with open(json_path) as f:
            data = json.load(f)

        record = {}
        img_filename = data["imagePath"]
        img_path = os.path.join(directory, img_filename)
        height, width = data["imageHeight"], data["imageWidth"]

        record["file_name"] = img_path
        record["image_id"] = idx
        record["height"] = height
        record["width"] = width
    
        objs = []
        for shape in data["shapes"]:
            label = shape["label"]
            if label not in class_labels:
                continue # Skip unknown labels.

            points = np.array(shape["points"], dtype=np.float32)
            px = points[:, 0].astype(float)
            py = points[:, 1].astype(float)

            # Create segmentation polygon
            poly = [(float(x), float(y)) for x, y in zip(px,py)]
            poly = [p for x in poly for p in x] # Flatten list
            x_min, y_min = float(px.min()), float(py.min())
            x_max, y_max = float(px.max()), float(py.max())

            obj = {
                "bbox": [x_min, y_min, x_max, y_max],
                "bbox_mode": BoxMode.XYXY_ABS,
                "segmentation": [poly],
                "category_id": class_labels.index(label),
            }

            objs.append(obj)
        record["annotations"] = objs
        dataset_dicts.append(record)

    print(f"Loaded {len(dataset_dicts)} labeled images from {directory}.")
    return dataset_dicts

Registering Datasets

In [4]:
class_labels = ["0", "1", "2", "3", "4", "5"]

DatasetCatalog.register("hands_train", lambda: get_hands_dataset_function("hand_dataset/train/", class_labels))
MetadataCatalog.get("hands_train").set(thing_classes = class_labels)

DatasetCatalog.register("hands_val", lambda: get_hands_dataset_function("hand_dataset/val/", class_labels))
MetadataCatalog.get("hands_val").set(thing_classes = class_labels)

# Metadata just store static info like class names and color maps.
hands_metadata = MetadataCatalog.get("hands_train") 

Check if all the data is properly loaded.

In [5]:
dataset_dicts = get_hands_dataset_function("hand_dataset/train/", class_labels)
print(f"Loaded {len(dataset_dicts)} images.")
for d in dataset_dicts:
    print(d["file_name"])

dataset_dicts = get_hands_dataset_function("hand_dataset/val/", class_labels)
print(f"Loaded {len(dataset_dicts)} images.")
for d in dataset_dicts:
    print(d["file_name"])



Loaded 36 labeled images from hand_dataset/train/.
Loaded 36 images.
hand_dataset/train/0_0.jpg
hand_dataset/train/0_1.jpg
hand_dataset/train/0_2.jpg
hand_dataset/train/0_3.jpg
hand_dataset/train/0_4.jpg
hand_dataset/train/0_5.jpg
hand_dataset/train/1_0.jpg
hand_dataset/train/1_1.jpg
hand_dataset/train/1_2.jpg
hand_dataset/train/1_3.jpg
hand_dataset/train/1_4.jpg
hand_dataset/train/1_5.jpg
hand_dataset/train/2_0.jpg
hand_dataset/train/2_1.jpg
hand_dataset/train/2_2.jpg
hand_dataset/train/2_3.jpg
hand_dataset/train/2_4.jpg
hand_dataset/train/2_5.jpg
hand_dataset/train/3_0.jpg
hand_dataset/train/3_1.jpg
hand_dataset/train/3_2.jpg
hand_dataset/train/3_3.jpg
hand_dataset/train/3_4.jpg
hand_dataset/train/3_5.jpg
hand_dataset/train/4_0.jpg
hand_dataset/train/4_1.jpg
hand_dataset/train/4_2.jpg
hand_dataset/train/4_3.jpg
hand_dataset/train/4_4.jpg
hand_dataset/train/4_5.jpg
hand_dataset/train/5_0.jpg
hand_dataset/train/5_1.jpg
hand_dataset/train/5_2.jpg
hand_dataset/train/5_3.jpg
hand_dataset/

Pull up some sample data for visualization

In [6]:
for d in random.sample(dataset_dicts, 3):
    img = cv2.imread(d["file_name"])
    visualizer = Visualizer(img[:, :, ::-1], metadata=hands_metadata, scale=0.5)
    out = visualizer.draw_dataset_dict(d)
    cv2.imshow("Sample Hands",out.get_image()[:, :, ::-1])
    cv2.waitKey(0)
    cv2.destroyAllWindows()

In [7]:
dataset_dicts = get_hands_dataset_function("hand_dataset/train/", ["0","1","2","3","4","5"])
for d in dataset_dicts[:3]:
    for a in d["annotations"]:
        x0, y0, x1, y1 = a["bbox"]
        print(f"{d['file_name']} width={x1-x0:.1f}, height={y1-y0:.1f}")

Loaded 36 labeled images from hand_dataset/train/.
hand_dataset/train/0_0.jpg width=261.7, height=530.4
hand_dataset/train/0_1.jpg width=484.3, height=435.7
hand_dataset/train/0_1.jpg width=113.9, height=88.7
hand_dataset/train/0_2.jpg width=461.7, height=405.2


Setting up Training Configurations

In [None]:
from detectron2.engine import DefaultTrainer
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.DATASETS.TRAIN = ("hands_train",)
cfg.DATASETS.TEST = ("hands_val",)
cfg.DATALOADER.NUM_WORKERS = 2

# Transfer learning from pretrained COCO Instance segmentation model
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")  # Let training initialize from model zoo

cfg.SOLVER.IMS_PER_BATCH = 2  # This is the real "batch size" commonly known to deep learning people
cfg.SOLVER.BASE_LR = 0.00025  # pick a good LR
cfg.SOLVER.MAX_ITER = 5000    # 300 iterations seems good enough for this toy dataset; you will need to train longer for a practical dataset
cfg.SOLVER.STEPS = []        # do not decay learning rate
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128   # The "RoIHead batch size". 128 is faster, and good enough for this toy dataset (default: 512)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 6  # only has one class (ballon). (see https://detectron2.readthedocs.io/tutorials/datasets.html#update-the-config-for-new-datasets)
# NOTE: this config means the number of classes, but a few popular unofficial tutorials incorrect uses num_classes+1 here.
cfg.OUTPUT_DIR = "handsydansy_output"
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
cfg.MODEL.DEVICE = "cpu" ## CUDA OR CPU!!!!
trainer = DefaultTrainer(cfg) 

[32m[11/10 11:41:47 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
 

Actual Training

In [9]:
from detectron2.data import DatasetCatalog
dataset_dicts = DatasetCatalog.get("hands_train")
print(len(dataset_dicts))
print(dataset_dicts[0])



Loaded 36 labeled images from hand_dataset/train/.
36
{'file_name': 'hand_dataset/train/0_0.jpg', 'image_id': 0, 'height': 550, 'width': 550, 'annotations': [{'bbox': [182.3913116455078, 14.1304349899292, 444.13043212890625, 544.5652465820312], 'bbox_mode': <BoxMode.XYXY_ABS: 0>, 'segmentation': [[196.30435, 542.8261, 210.21739, 388.91306, 185.86957, 244.56522, 182.39131, 147.17392, 222.39131, 52.391304, 265.86957, 14.130435, 352.82608, 17.608696, 383.26086, 53.260868, 397.17392, 51.52174, 444.13043, 120.21739, 422.3913, 227.17392, 371.9565, 315.86957, 348.47827, 401.9565, 349.34784, 544.56525]], 'category_id': 0}]}


In [10]:
trainer.resume_or_load(resume=False)
trainer.train()
# TRAIN

[32m[11/10 09:35:17 d2.checkpoint.detection_checkpoint]: [0m[DetectionCheckpointer] Loading from https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl ...


Skip loading parameter 'roi_heads.box_predictor.cls_score.weight' to the model due to incompatible shapes: (81, 1024) in the checkpoint but (7, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.cls_score.bias' to the model due to incompatible shapes: (81,) in the checkpoint but (7,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.weight' to the model due to incompatible shapes: (320, 1024) in the checkpoint but (24, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.bias' to the model due to incompatible shapes: (320,) in the checkpoint but (24,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.mask_head.predictor.weight' to the model due to incompatible shapes: (80, 256, 1, 1) in the checkpoint but (6, 256, 1, 1) 

[32m[11/10 09:35:18 d2.engine.train_loop]: [0mStarting training from iteration 0


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


[32m[11/10 09:35:33 d2.utils.events]: [0m eta: 0:32:24  iter: 19  total_loss: 3.236  loss_cls: 1.908  loss_box_reg: 0.612  loss_mask: 0.6929  loss_rpn_cls: 0.001439  loss_rpn_loc: 0.0055    time: 0.3933  last_time: 0.3661  data_time: 0.2966  last_data_time: 0.0015   lr: 4.9953e-06  max_mem: 1769M
[32m[11/10 09:35:42 d2.utils.events]: [0m eta: 0:32:56  iter: 39  total_loss: 3.039  loss_cls: 1.702  loss_box_reg: 0.6392  loss_mask: 0.6927  loss_rpn_cls: 0.001959  loss_rpn_loc: 0.00523    time: 0.4014  last_time: 0.4037  data_time: 0.0017  last_data_time: 0.0019   lr: 9.9902e-06  max_mem: 1769M
[32m[11/10 09:35:50 d2.utils.events]: [0m eta: 0:33:40  iter: 59  total_loss: 2.539  loss_cls: 1.298  loss_box_reg: 0.5865  loss_mask: 0.6894  loss_rpn_cls: 0.001173  loss_rpn_loc: 0.004486    time: 0.4097  last_time: 0.3861  data_time: 0.0018  last_data_time: 0.0018   lr: 1.4985e-05  max_mem: 1771M
[32m[11/10 09:35:59 d2.utils.events]: [0m eta: 0:34:12  iter: 79  total_loss: 2.118  loss_cls

In [10]:
# Inference should use the config with parameters that are used in training
# cfg now already contains everything we've set previously. We changed it a little bit for inference:
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")  # path to the model we just trained
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5   # set a custom testing threshold
predictor = DefaultPredictor(cfg)

[32m[11/10 11:42:06 d2.checkpoint.detection_checkpoint]: [0m[DetectionCheckpointer] Loading from handsydansy_output\model_final.pth ...


  return torch.load(f, map_location=torch.device("cpu"))


Visualization and Result Analysis

In [16]:
# For test dataset without labelling.
def get_test_dataset_function(directory):
    dataset_dicts = []
    img_files = [f for f in os.listdir(directory) if f.endswith((".jpg", ".png", ".jpeg"))] # JSON FILES
    for idx, img_file in enumerate(img_files):
        img_path = os.path.join(directory, img_file)
        im = cv2.imread(img_path)
        height, width = im.shape[:2]

        record = {
            "file_name" : img_path,
            "image_id" : idx,
            "height": height,
            "width" : width
        }

        dataset_dicts.append(record)

    print(f"Loaded {len(dataset_dicts)} test images from {directory}.")
    return dataset_dicts

In [None]:
# Test on Validation Dataset
from detectron2.utils.visualizer import ColorMode
import time

dataset_dicts = get_hands_dataset_function("hand_dataset/val/", class_labels)
for d in random.sample(dataset_dicts, 3):    
    im = cv2.imread(d["file_name"])
    start = time.time()
    outputs = predictor(im)  # format is documented at https://detectron2.readthedocs.io/tutorials/models.html#model-output-format
    end = time.time()
    print(outputs['instances'])
    v = Visualizer(im[:, :, ::-1],
                   metadata=hands_metadata, 
                   scale=1, 
                   instance_mode=ColorMode.IMAGE_BW   # remove the colors of unsegmented pixels. This option is only available for segmentation models
    )
    print(f"Time taken: {end-start:.3f}s")
    print(outputs['instances'])
    out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    cv2.imshow("Hands Validation XDDD",out.get_image()[:, :, ::-1])
    cv2.waitKey(0)
    cv2.destroyAllWindows()

Loaded 0 labeled images from hand_dataset/test/.


ValueError: Sample larger than population or is negative

In [18]:
# Test on Test Dataset (No labelling)
from detectron2.utils.visualizer import ColorMode
import time

dataset_dicts = get_test_dataset_function("hand_dataset/test/")
for d in random.sample(dataset_dicts, 10):    
    im = cv2.imread(d["file_name"])
    start = time.time()
    outputs = predictor(im)  # format is documented at https://detectron2.readthedocs.io/tutorials/models.html#model-output-format
    end = time.time()
    print(outputs['instances'])
    v = Visualizer(im[:, :, ::-1],
                   metadata=hands_metadata, 
                   scale=1, 
                   instance_mode=ColorMode.IMAGE_BW   # remove the colors of unsegmented pixels. This option is only available for segmentation models
    )
    print(f"Time taken: {end-start:.3f}s")
    print(outputs['instances'])
    out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    cv2.imshow("Hands Validation XDDD",out.get_image()[:, :, ::-1])
    cv2.waitKey(0)
    cv2.destroyAllWindows()

Loaded 12 test images from hand_dataset/test/.
Instances(num_instances=1, image_height=550, image_width=550, fields=[pred_boxes: Boxes(tensor([[161.2099,  63.7225, 411.9579, 549.3049]])), scores: tensor([0.9937]), pred_classes: tensor([3]), pred_masks: tensor([[[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]]])])
Time taken: 0.901s
Instances(num_instances=1, image_height=550, image_width=550, fields=[pred_boxes: Boxes(tensor([[161.2099,  63.7225, 411.9579, 549.3049]])), scores: tensor([0.9937]), pred_classes: tensor([3]), pred_masks: tensor([[[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  

In [19]:
# ------------------- EVALUATION -------------------
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader
import os

# Make sure the output dir exists
eval_output_dir = os.path.join(cfg.OUTPUT_DIR, "output_eval")
os.makedirs(eval_output_dir, exist_ok=True)

# Create COCOEvaluator
evaluator = COCOEvaluator("hands_val", output_dir=eval_output_dir)

# Build the validation data loader
val_loader = build_detection_test_loader(cfg, "hands_val")

# Run inference & print results
metrics = inference_on_dataset(predictor.model, val_loader, evaluator)
print("Evaluation results:", metrics)

# Optional: equivalent using trainer.test()
# results = trainer.test(cfg, trainer.model, evaluators=[evaluator])
# print("Trainer test results:", results)


[32m[11/10 11:54:35 d2.evaluation.coco_evaluation]: [0mFast COCO eval is not built. Falling back to official COCO eval.
[32m[11/10 11:54:35 d2.evaluation.coco_evaluation]: [0mTrying to convert 'hands_val' to COCO format ...
[32m[11/10 11:54:35 d2.data.datasets.coco]: [0mConverting annotations of dataset 'hands_val' to COCO format ...)
Loaded 12 labeled images from hand_dataset/val/.
[32m[11/10 11:54:35 d2.data.datasets.coco]: [0mConverting dataset dicts into COCO format
[32m[11/10 11:54:35 d2.data.datasets.coco]: [0mConversion finished, #images: 12, #annotations: 12
[32m[11/10 11:54:35 d2.data.datasets.coco]: [0mCaching COCO format annotations at 'handsydansy_output\output_eval\hands_val_coco_format.json' ...


TypeError: Object of type float32 is not JSON serializable