In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zarr
from tqdm import tqdm
import glob, os
import cv2

In [2]:
path = '/media/max1024/Extreme SSD1/Kaggle/czii-cryo-et-object-identification/'

In [3]:
runs = sorted(glob.glob(path + 'train/overlay/ExperimentRuns/*'))
runs = [os.path.basename(x) for x in runs]
i2r_dict = {i:r for i, r in zip(range(len(runs)), runs)}
r2t_dict = {r:i for i, r in zip(range(len(runs)), runs)}
i2r_dict

{0: 'TS_5_4',
 1: 'TS_69_2',
 2: 'TS_6_4',
 3: 'TS_6_6',
 4: 'TS_73_6',
 5: 'TS_86_3',
 6: 'TS_99_9'}

In [4]:
def convert_to_8bit(x):
    lower, upper = np.percentile(x, (0.5, 99.5))
    x = np.clip(x, lower, upper)
    x = (x - x.min()) / (x.max() - x.min() + 1e-12) * 255
    return x.round().astype("uint8")

In [5]:
p2i_dict = {
        'apo-ferritin': 0,
        'beta-amylase': 1,
        'beta-galactosidase': 2,
        'ribosome': 3,
        'thyroglobulin': 4,
        'virus-like-particle': 5
    }

i2p = {v:k for k, v in p2i_dict.items()}

particle_radius = {
        'apo-ferritin': 60,
        'beta-amylase': 65,
        'beta-galactosidase': 90,
        'ribosome': 150,
        'thyroglobulin': 130,
        'virus-like-particle': 135,
    }

In [6]:
particle_names = ['apo-ferritin', 'beta-amylase', 'beta-galactosidase', 'ribosome', 'thyroglobulin', 'virus-like-particle']

In [7]:
def make_annotate_yolo(run_name, is_train_path=True):
    # to split validation
    is_train_path = 'train' if is_train_path else 'val'

    # read a volume
    vol = zarr.open(path + f'train/static/ExperimentRuns/{run_name}/VoxelSpacing10.000/denoised.zarr', mode='r') #bug fixed. Thanks to @pratyushh
    # use largest images
    vol = vol[0]
    # normalize [0, 255]
    vol2 = convert_to_8bit(vol)
    
    n_imgs = vol2.shape[0]
    # process each slices
    for j in range(n_imgs):
        newvol = vol2[j]
        newvolf = np.stack([newvol]*3, axis=-1)
        # YOLO requires image_size is multiple of 32
        newvolf = cv2.resize(newvolf, (640,640))
        # save as 1 slice
        cv2.imwrite(path + f'output/images/{is_train_path}/{run_name}_{j*10}.png', newvolf)
        # make txt file for annotation
        with open(path + f'output/labels/{is_train_path}/{run_name}_{j*10}.txt', 'w'):
            pass # make empty file
            
    # process each paticle types
    for p, particle in enumerate(tqdm(particle_names)):
        # we do not have to detect beta-amylase which weight is 0
        if particle=="beta-amylase":
            continue
        json_each_paticle = path + f"train/overlay/ExperimentRuns/{run_name}/Picks/{particle}.json"
        df = pd.read_json(json_each_paticle) 
        # pick each coordinate of particles
        for axis in "x", "y", "z":
            df[axis] = df.points.apply(lambda x: x["location"][axis])

        
        radius = particle_radius[particle]
        for i, row in df.iterrows():
            # The radius from the center of the particle is used to determine the slices present.
            start_z = np.round(row['z'] - radius).astype(np.int32)
            start_z = max(0, start_z//10) # 10 means pixelspacing
            end_z = np.round(row['z'] + radius).astype(np.int32)
            end_z = min(n_imgs, end_z//10) # 10 means pixelspacing
            
            for j in range(start_z+1, end_z+1-1, 1):
                # white the results of annotation
                with open(path + f'output/labels/{is_train_path}/{run_name}_{j*10}.txt', 'a') as f:
                    f.write(f'{p2i_dict[particle]} {row["x"]/10.012444537618887/vol2.shape[1]} {row["y"]/10.012444196428572/vol2.shape[2]} {radius/10/vol2.shape[1]*2} {radius/10/vol2.shape[2]*2} \n')

In [8]:
os.makedirs(path + "output/images/train", exist_ok=True)
os.makedirs(path + "output/images/val", exist_ok=True)
os.makedirs(path + "output/labels/val", exist_ok=True)
os.makedirs(path + "output/labels/train", exist_ok=True)

In [9]:
# use TS_5_4 as validation
for i, r in enumerate(runs):
    make_annotate_yolo(r, is_train_path=False if i==0 else True)

100%|█████████████████████████████████████████████| 6/6 [00:00<00:00, 61.74it/s]
100%|█████████████████████████████████████████████| 6/6 [00:00<00:00, 59.98it/s]
100%|█████████████████████████████████████████████| 6/6 [00:00<00:00, 45.31it/s]
100%|█████████████████████████████████████████████| 6/6 [00:00<00:00, 64.75it/s]
100%|█████████████████████████████████████████████| 6/6 [00:00<00:00, 46.11it/s]
100%|█████████████████████████████████████████████| 6/6 [00:00<00:00, 40.27it/s]
100%|█████████████████████████████████████████████| 6/6 [00:00<00:00, 43.86it/s]


In [10]:
import shutil
os.makedirs(path + 'output/datasets/czii_det2d', exist_ok=True)
shutil.move(path + 'output/images/train', path + 'output/datasets/czii_det2d/images/train')
shutil.move(path + 'output/images/val', path + 'output/datasets/czii_det2d/images')
shutil.move(path + 'output/labels/train', path + 'output/datasets/czii_det2d/labels/train')
shutil.move(path + 'output/labels/val', path + 'output/datasets/czii_det2d/labels')

'/media/max1024/Extreme SSD1/Kaggle/czii-cryo-et-object-identification/output/datasets/czii_det2d/labels/val'

In [11]:
%%writefile czii_conf.yaml

path: /media/max1024/Extreme SSD1/Kaggle/czii-cryo-et-object-identification/output/datasets/czii_det2d # dataset root dir
train: images/train # train images (relative to 'path') 
val: images/val # val images (relative to 'path') 

# Classes
names:
  0: apo-ferritin
  1: beta-amylase
  2: beta-galactosidase
  3: ribosome
  4: thyroglobulin
  5: virus-like-particle

Overwriting czii_conf.yaml


# References

1. https://www.kaggle.com/code/itsuki9180/czii-making-datasets-for-yolo/notebook