In [14]:
# Import the libraries used for preprocessing
import os
import sys
import random
import math
import re
import time
import numpy as np
import json
import cv2
import matplotlib
import matplotlib.pyplot as plt
import scipy
import warnings
import shutil
import random
from PIL import Image
from pycococreatortools import pycococreatortools
warnings.filterwarnings('ignore')

In [5]:
def oragnize_data(image_path, mask_path, fold, test_split, seed=None):
    '''
    Exports one fold stored as ``images.npy`` / ``masks.npy`` into a per-image
    folder tree under ``Pannuke_dataset/`` split into "train" and "val":

        Pannuke_dataset/<phase>/image_<i>/images/image_<i>.jpg
        Pannuke_dataset/<phase>/image_<i>/masks/<cell_name>/masks_<n>.jpg

    Parameters
    ----------
    image_path : str
        Directory that contains ``images.npy``.
    mask_path : str
        Directory that contains ``masks.npy``.
    fold : int
        Fold number (1, 2 or 3). Together with the number of images it selects
        the hard-coded list of samples to drop; for any other fold/size
        combination nothing is dropped.
    test_split : float
        Despite its name, this is the fraction of samples assigned to
        "train" (the first ``floor(test_split * n)`` shuffled indices); the
        remainder goes to "val".
    seed : int, optional
        When given, the shuffle uses a private ``random.Random(seed)`` so the
        split is reproducible. When ``None`` (default) the global ``random``
        state is used.

    Notes
    -----
    The output directory is never cleared. Re-running with a different
    shuffle can therefore leave the same ``image_<i>`` folder in both
    ``train`` and ``val``; delete ``Pannuke_dataset/`` first or pass a fixed
    ``seed``.
    '''
    index = []
    cell_names = ["neoplastic","inflammatory","softtissue","dead","epithelial"]
    # Loading the data. The arrays are only read, so map them read-only
    # instead of 'r+' (no write permission needed, source files stay untouched).
    images = np.load(os.path.join(image_path, "images.npy"), mmap_mode='r')
    masks = np.load(os.path.join(mask_path, "masks.npy"), mmap_mode='r')

    print('----------Loaded data----------')
    
    # Changing the datatype to reduce the size (this also copies the data
    # out of the memory map into regular memory)
    images = images.astype(np.int16)
    masks = masks.astype(np.int16)
    
    print('----------Reduced size----------')

    # Selecting the list of indexes of images with no cells. The lists are
    # only valid for the exact array sizes checked here.
    if fold == 1 and images.shape[0]== 2656:
        index = [584, 586, 604, 748, 750, 780, 811, 812, 813, 828, 830, 832, 833,
                 996, 998, 1147, 1148, 1149, 1152, 1155, 1158, 1160, 1161, 1164,
                 1166, 1432, 1433, 1512, 1578, 1614, 1615, 1616, 1617, 1618, 1619,
                 1620, 1629, 1632, 1704, 1705, 1707, 1708, 1709, 1723, 1724, 1725,
                 1748, 1749, 1750, 1751, 1752, 1753, 1859, 1864, 1870, 1880, 1923,
                 1939, 1940, 1945, 1946, 1966, 1967, 1968, 1969, 1970, 1971, 1972,
                 1973, 1974, 1975, 1976, 1977, 1978, 1979, 2007, 2009, 2019, 2020,
                 2022, 2098, 2108, 2109, 2110, 2111, 2115, 2131, 2132, 2133, 2134,
                 2135, 2137, 2163, 2164, 2165, 2174, 2176, 2202, 2263, 2264, 2265,
                 2267, 2406, 2407, 2462, 2463, 2464, 2465, 2515, 2550, 2551, 2552,
                 2626, 2636, 2639, 2640]   
    if fold  == 2 and images.shape[0]== 2523:
        index = [544, 679, 680, 724, 749, 750, 752, 753, 1028, 1029, 1241, 1248,
                 1249, 1403, 1404, 1434, 1435, 1436, 1440, 1470, 1471, 1472, 1473,
                 1474, 1475, 1476, 1477, 1478, 1524, 1526, 1538, 1539, 1540, 1541,
                 1542, 1543, 1544, 1545, 1546, 1558, 1559, 1560, 1600, 1601, 1607,
                 1651, 1653, 1657, 1660, 1661, 1662, 1665, 1684, 1685, 1686, 1687,
                 1688, 1689, 1690, 1691, 1692, 1693, 1694, 1695, 1696, 1697, 1698,
                 1699, 1700, 1701, 1702, 1738, 1741, 1742, 1743, 1746, 1749, 1845,
                 1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853, 1876, 1889, 1921,
                 1928, 1929, 1930, 1931, 1932, 1933, 1936, 1998, 1999, 2000, 2003,
                 2004, 2005, 2019, 2021, 2268, 2269, 2310, 2315, 2397, 2450, 2494,
                 2508, 2511]
    if fold == 3 and images.shape[0]== 2722:
        index = [236, 546, 735, 754, 762, 778, 780, 784, 1016, 1076, 1078, 1079,
                 1083, 1085, 1088, 1307, 1438, 1506, 1507, 1508, 1509, 1510, 1511,
                 1518, 1519, 1523, 1566, 1567, 1568, 1569, 1570, 1571, 1572, 1611,
                 1635, 1636, 1645, 1646, 1647, 1648, 1664, 1665, 1666, 1667, 1668,
                 1669, 1670, 1743, 1757, 1777, 1779, 1780, 1797, 1803, 1804, 1805,
                 1832, 1833, 1834, 1835, 1836, 1837, 1838, 1839, 1840, 1841, 1842,
                 1843, 1844, 1845, 1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853,
                 1854, 1855, 1856, 1857, 1858, 1859, 1860, 1861, 1862, 1894, 1896,
                 1897, 1987, 1988, 1989, 1990, 1991, 1992, 2023, 2025, 2026, 2063,
                 2064, 2065, 2066, 2067, 2074, 2123, 2124, 2138, 2142, 2143, 2373,
                 2375, 2694, 2695]

    # Deleting indexes with images which contain no cells
    images = np.delete(images, index, 0)
    masks = np.delete(masks, index, 0)

    print('----------Removed images with no cells----------')

    indices = list(range(len(images)))
    if seed is None:
        random.shuffle(indices)
    else:
        random.Random(seed).shuffle(indices)
    # A set keeps the per-image membership test below O(1); with a list the
    # export loop was quadratic in the number of images.
    train_indices = set(indices[:math.floor(test_split*len(indices))])

    print('----------Splitting indices----------')

    # Organising folders
    os.makedirs('Pannuke_dataset', exist_ok=True)

    for i, img in enumerate(images):
        phase = "train" if i in train_indices else "val"
        sample_name = 'image_' + str(i)
        sample_dir = os.path.join('Pannuke_dataset', phase, sample_name)

        image_dir = os.path.join(sample_dir, 'images')
        os.makedirs(image_dir, exist_ok=True)
        im = Image.fromarray(img.astype(np.uint8))
        im.save(os.path.join(image_dir, sample_name + '.jpg'))

        # The last mask channel is deliberately not exported
        # (presumably the background channel -- confirm against the dataset docs).
        for k in range(masks[i].shape[2]-1):
            cell_mask = masks[i][:,:,k]
            labels = np.unique(cell_mask)
            # Label 0 is taken to mean "no cell". Dropping exactly that value
            # (rather than whatever label happens to be smallest) keeps every
            # instance even when a channel contains no 0 pixels.
            instance_ids = labels[labels != 0]
            if instance_ids.size == 0:
                continue
            class_dir = os.path.join(sample_dir, 'masks', cell_names[k])
            os.makedirs(class_dir, exist_ok=True)
            # One binary mask file per instance
            for l, instance_id in enumerate(instance_ids):
                ms = Image.fromarray(cell_mask == instance_id)
                ms.save(os.path.join(class_dir, 'masks_'+str(l)+'.jpg'))
    print('----------Finished organising----------')


In [7]:
# Export fold 1; 0.8 is the fraction of samples written to the "train" phase.
fold = 1
image_path = './Fold 1/images/fold1/'
mask_path = './Fold 1/masks/fold1/'
oragnize_data(
    image_path=image_path,
    mask_path=mask_path,
    fold=fold,
    test_split=0.8,
)

----------Loaded data----------
----------Reduced size----------
----------Removed images with no cells----------
----------Splitting indices----------
----------Finished organising----------


In [8]:
# Copy every image into one flat folder, as desired by Detectron2
def final_directory_structure(input_path,output_path):
    '''
    Copies ``<input_path>/<name>/images/<name>.jpg`` to
    ``<output_path>/<name>.jpg`` for every sample folder in ``input_path``.

    Parameters
    ----------
    input_path : str
        Folder holding one sub-folder per sample. A trailing path separator
        is optional.
    output_path : str
        Destination folder; created (with parents) if it does not exist.

    Hidden entries (names starting with ".", e.g. ``.ipynb_checkpoints``) and
    plain files inside ``input_path`` are skipped. A sample folder that lacks
    its image still raises ``FileNotFoundError``.
    '''
    print(input_path)
    entries = os.listdir(input_path)
    os.makedirs(output_path, exist_ok=True)
    for img in sorted(entries):
        sample_dir = os.path.join(input_path, img)
        # Stray files and hidden folders are not samples; copying them would crash.
        if img.startswith('.') or not os.path.isdir(sample_dir):
            continue
        shutil.copyfile(os.path.join(sample_dir, 'images', img + '.jpg'),
                        os.path.join(output_path, img + '.jpg'))

In [10]:
# Flatten the training images into a single folder.
train_output_path = './final_pannuke_dataset/train/'
train_input_path = "./Pannuke_dataset/train/"
final_directory_structure(
    input_path=train_input_path,
    output_path=train_output_path,
)

./Pannuke_dataset/train/


In [11]:
# Flatten the validation images into a single folder.
val_output_path = './final_pannuke_dataset/val/'
val_input_path = "./Pannuke_dataset/val/"
final_directory_structure(
    input_path=val_input_path,
    output_path=val_output_path,
)

./Pannuke_dataset/val/


In [12]:
def pannuke_to_coco_format(image_path, output_path, categories = ["neoplastic","inflammatory","softtissue","dead","epithelial"] , dataset_name = "pannuke"):
    '''
    Converts a per-image folder tree into a single COCO-format JSON file,
    which makes it easier to apply Detectron2 algorithms to the dataset.

    Expected input layout (one folder per sample inside ``image_path``):

        <image_path>/<name>/images/<name>.jpg
        <image_path>/<name>/masks/<category>/<mask file>

    Parameters
    ----------
    image_path : str
        Folder holding the sample folders. A trailing separator is optional.
    output_path : str
        Path of the JSON file to write (overwritten if it exists).
    categories : sequence of str
        Category names; the COCO category id is the 1-based position in this
        sequence. Mask folders whose name is not listed here are skipped.
        The sequence is never modified.
    dataset_name : str
        Stored as the "supercategory" of every category.

    Image ids and annotation ids start at 1. Directory listings are sorted so
    the ids are the same on every run. Annotation ids may have gaps because
    the counter also advances for masks that ``create_annotation_info``
    rejects (returns ``None``).
    '''
    cocoformat = {"licenses":[], "info":[], "images":[], "annotations":[], "categories":[]}
    # name -> 1-based category id; setdefault keeps the first position for
    # duplicated names, matching list.index().
    category_ids = {}
    for idx, name in enumerate(categories):
        cocoformat["categories"].append({"id": int(idx+1), "name": name, "supercategory": dataset_name})
        category_ids.setdefault(name, idx + 1)

    # Only real sample folders; hidden entries such as .ipynb_checkpoints and
    # loose files would otherwise crash the loop below.
    images_name = sorted(
        entry for entry in os.listdir(image_path)
        if not entry.startswith('.') and os.path.isdir(os.path.join(image_path, entry))
    )

    m_id = 1
    for i, img in enumerate(images_name):
        sample_dir = os.path.join(image_path, img)
        # Only the size is needed; close the file right away so thousands of
        # handles are not left open until garbage collection.
        with Image.open(os.path.join(sample_dir, "images", img + ".jpg")) as image:
            image_size = image.size
        image_info = pycococreatortools.create_image_info(
                int(i+1), img + ".jpg" , image_size)
        cocoformat["images"].append(image_info)

        masks_root = os.path.join(sample_dir, "masks")
        if not os.path.isdir(masks_root):
            # Sample without any exported mask: image entry only.
            continue
        for c in sorted(os.listdir(masks_root)):
            if c not in category_ids:
                continue
            category_info = {'id': int(category_ids[c]), 'is_crowd': False}
            class_dir = os.path.join(masks_root, c)
            for msk in sorted(os.listdir(class_dir)):
                with Image.open(os.path.join(class_dir, msk)) as mask_image:
                    # Explicit threshold instead of convert('1'): that mode
                    # dithers by default, which turns JPEG compression noise
                    # into scattered foreground pixels.
                    m_image = (np.asarray(mask_image.convert('L')) > 127).astype(np.uint8)
                annotation_info = pycococreatortools.create_annotation_info(
                        m_id, int(i+1), category_info, m_image,
                        image_size, tolerance=2)
                m_id = m_id + 1
                if annotation_info is not None:
                    cocoformat["annotations"].append(annotation_info)
    with open(output_path, "w") as f:
        json.dump(cocoformat, f)

In [15]:
# Write the COCO annotation file for the training phase.
train_output_path = './final_pannuke_dataset/pannuke_train.json'
train_path = './Pannuke_dataset/train/'
pannuke_to_coco_format(
    image_path=train_path,
    output_path=train_output_path,
    categories=["neoplastic", "inflammatory", "softtissue", "dead", "epithelial"],
    dataset_name="pannuke_train",
)

In [16]:
# Write the COCO annotation file for the validation phase.
val_output_path = './final_pannuke_dataset/pannuke_val.json'
val_path = './Pannuke_dataset/val/'
pannuke_to_coco_format(
    image_path=val_path,
    output_path=val_output_path,
    categories=["neoplastic", "inflammatory", "softtissue", "dead", "epithelial"],
    dataset_name="pannuke_val",
)