In [15]:
import os
import shutil
import datasets
import ultralytics
import supervision as sv

from tqdm import tqdm


In [2]:
fake_ds = sv.DetectionDataset.from_yolo("datasets/scc_cell_detection_fake/train/images", "datasets/scc_cell_detection_fake/train/labels", "datasets/scc_cell_detection_fake/data.yaml")
print(len(fake_ds))

2500


In [12]:
real_ds = datasets.load_dataset("mario-dg/brightfield-microscopy-scc-filtered", cache_dir=".cache/")
print(real_ds)

DatasetDict({
    train: Dataset({
        features: ['image', 'label', 'width', 'height', 'objects', 'well_edge'],
        num_rows: 20558
    })
    validation: Dataset({
        features: ['image', 'label', 'width', 'height', 'objects', 'well_edge'],
        num_rows: 2527
    })
    test: Dataset({
        features: ['image', 'label', 'width', 'height', 'objects', 'well_edge'],
        num_rows: 1200
    })
})


In [39]:
splits = real_ds.keys()
shutil.rmtree("datasets/scc_cell_detection_real", ignore_errors=True)
for split in splits:
    ds_dir = f"datasets/scc_cell_detection_real/{split}"
    images_dir = f"{ds_dir}/images"
    labels_dir = f"{ds_dir}/labels"
    os.makedirs(ds_dir, exist_ok=True)
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(labels_dir, exist_ok=True)
    for index, item in tqdm(enumerate(real_ds[split]), desc=split, total=len(real_ds[split])):
        image_file = f"{ds_dir}/images/real_cell_image_{index:04d}.png"
        item["image"].save(image_file)
        with open(f"{ds_dir}/labels/real_cell_image_{index:04d}.txt", 'w') as f:
            for det_index, det in enumerate(item['objects']['bbox']):
                f.write(f"{item['objects']['categories'][det_index]} {det[0]} {det[1]} {det[2]} {det[3]}\n")
        if split == "train" and index == 4999:
            break

Images:  24%|██▍       | 5000/20558 [05:48<18:03, 14.36it/s]
Images: 100%|██████████| 2527/2527 [02:56<00:00, 14.32it/s]
Images: 100%|██████████| 1200/1200 [01:23<00:00, 14.35it/s]


In [4]:
real_train_ds = sv.DetectionDataset.from_yolo("datasets/scc_cell_detection_real/train/images", "datasets/scc_cell_detection_real/train/labels", "datasets/scc_cell_detection_real/data.yaml")
real_val_ds = sv.DetectionDataset.from_yolo("datasets/scc_cell_detection_real/validation/images", "datasets/scc_cell_detection_real/validation/labels", "datasets/scc_cell_detection_real/data.yaml")
real_test_ds = sv.DetectionDataset.from_yolo("datasets/scc_cell_detection_real/test/images", "datasets/scc_cell_detection_real/test/labels", "datasets/scc_cell_detection_real/data.yaml")
print(len(real_train_ds))
print(len(real_val_ds))
print(len(real_test_ds))

5000
2527
1200


In [5]:
REAL_10 = int(5000 * 0.9)
REAL_30 = int(5000 * 0.7)
REAL_50 = int(5000 * 0.5)
FAKE_10 = 5000 - REAL_10
FAKE_30 = 5000 - REAL_30
FAKE_50 = 5000 - REAL_50

print(f"{REAL_10=} {FAKE_10=}")
print(f"{REAL_30=} {FAKE_30=}")
print(f"{REAL_50=} {FAKE_50=}")

REAL_10=4500 FAKE_10=500
REAL_30=3500 FAKE_30=1500
REAL_50=2500 FAKE_50=2500


In [13]:
print(sv.DetectionDataset(
            classes=fake_ds.classes,
            images={name: fake_ds.images[name] for name in list(fake_ds.images.keys())[:500]},
            annotations={name: fake_ds.annotations[name] for name in list(fake_ds.images.keys())[:500]},
        ))

DetectionDataset(classes=['cell'], images={'datasets\\scc_cell_detection_fake\\train\\images\\sample_00001_38_png.rf.cd93f83a56e890806bb58cd7b9035cd7.jpg': array([[[157, 160, 158],
        [157, 160, 158],
        [156, 159, 157],
        ...,
        [136, 139, 137],
        [146, 149, 147],
        [158, 161, 159]],

       [[152, 155, 153],
        [152, 155, 153],
        [153, 156, 154],
        ...,
        [139, 142, 140],
        [143, 146, 144],
        [148, 151, 149]],

       [[148, 151, 149],
        [148, 151, 149],
        [149, 152, 150],
        ...,
        [143, 146, 144],
        [140, 143, 141],
        [138, 141, 139]],

       ...,

       [[  8,  11,   9],
        [  8,  11,   9],
        [  8,  11,   9],
        ...,
        [  7,  10,   8],
        [  7,  10,   8],
        [  7,  10,   8]],

       [[  8,  11,   9],
        [  8,  11,   9],
        [  8,  11,   9],
        ...,
        [  7,  10,   8],
        [  7,  10,   8],
        [  7,  10,   8]],

      

In [14]:
scc_cell_detection_10 = sv.DetectionDataset.merge([sv.DetectionDataset(
                                                    classes=fake_ds.classes,
                                                    images={name: fake_ds.images[name] for name in list(fake_ds.images.keys())[:FAKE_10]},
                                                    annotations={name: fake_ds.annotations[name] for name in list(fake_ds.images.keys())[:FAKE_10]},
                                                ), 
                                                    sv.DetectionDataset(
                                                    classes=real_train_ds.classes,
                                                    images={name: real_train_ds.images[name] for name in list(real_train_ds.images.keys())[:REAL_10]},
                                                    annotations={name: real_train_ds.annotations[name] for name in list(real_train_ds.images.keys())[:REAL_10]},
                                                )])
scc_cell_detection_30 = sv.DetectionDataset.merge([sv.DetectionDataset(
                                                    classes=fake_ds.classes,
                                                    images={name: fake_ds.images[name] for name in list(fake_ds.images.keys())[:FAKE_30]},
                                                    annotations={name: fake_ds.annotations[name] for name in list(fake_ds.images.keys())[:FAKE_30]},
                                                    ), 
                                                    sv.DetectionDataset(
                                                    classes=real_train_ds.classes,
                                                    images={name: real_train_ds.images[name] for name in list(real_train_ds.images.keys())[:REAL_30]},
                                                    annotations={name: real_train_ds.annotations[name] for name in list(real_train_ds.images.keys())[:REAL_30]},
                                                )])
scc_cell_detection_50 = sv.DetectionDataset.merge([sv.DetectionDataset(
                                                    classes=fake_ds.classes,
                                                    images={name: fake_ds.images[name] for name in list(fake_ds.images.keys())[:FAKE_50]},
                                                    annotations={name: fake_ds.annotations[name] for name in list(fake_ds.images.keys())[:FAKE_50]},
                                                    ), 
                                                    sv.DetectionDataset(
                                                    classes=real_train_ds.classes,
                                                    images={name: real_train_ds.images[name] for name in list(real_train_ds.images.keys())[:REAL_50]},
                                                    annotations={name: real_train_ds.annotations[name] for name in list(real_train_ds.images.keys())[:REAL_50]},
                                                )])

scc_cell_detection_10.as_yolo("datasets/scc_cell_detection_10/train/images", "datasets/scc_cell_detection_10/train/labels", "datasets/scc_cell_detection_10/data.yaml")
scc_cell_detection_30.as_yolo("datasets/scc_cell_detection_30/train/images", "datasets/scc_cell_detection_30/train/labels", "datasets/scc_cell_detection_30/data.yaml")
scc_cell_detection_50.as_yolo("datasets/scc_cell_detection_50/train/images", "datasets/scc_cell_detection_50/train/labels", "datasets/scc_cell_detection_50/data.yaml")