<a href="https://colab.research.google.com/github/matjesg/deepflash2/blob/master/paper/challenge_data/preprocess_conic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data preprocessing for the CoNIC: Colon Nuclei Identification and Counting Challenge 2022


![Conic Logo](https://rumc-gcorg-p-public.s3.amazonaws.com/b/648/banner.x10.jpeg)


From https://conic-challenge.grand-challenge.org/:
- The dataset is available at https://warwick.ac.uk/fac/cross_fac/tia/data/lizard/

**References:** Graham, Simon, et al. "Lizard: A Large-Scale Dataset for Colonic Nuclear Instance Segmentation and Classification." Proceedings of the IEEE/CVF International Conference on Computer Vision. 2021.


## 1. Download and extract data

In [None]:
!wget -O images1.zip https://warwick.ac.uk/fac/cross_fac/tia/data/lizard/lizard_images1.zip
!wget -O images2.zip https://warwick.ac.uk/fac/cross_fac/tia/data/lizard/lizard_images2.zip
!wget -O labels.zip https://warwick.ac.uk/fac/cross_fac/tia/data/lizard/lizard_labels.zip
!mkdir train
!unzip -ju images1.zip -d train/images 
!unzip -juo images2.zip -d train/images
!unzip -oqju labels.zip -d mat_labels

## 2. Imports

In [None]:
# deepflash2 preprocessing required
!pip install -qq git+https://github.com/matjesg/deepflash2.git@master

In [None]:
# Imports
from pathlib import Path
import scipy.io as sio
import numpy as np
import pandas as pd
import imageio
import cv2
from fastprogress import progress_bar
from deepflash2.data import preprocess_mask
import shutil
from fastai.vision.all import *
from sklearn.model_selection import train_test_split

## 3. Convert and save masks

In [None]:
# Folder holding the extracted `.mat` label files and the `info.csv` index.
mat_path = Path('mat_labels')
# One row per labelled image; the `Filename` column is used to locate each `.mat` file.
df = pd.read_csv(mat_path/'info.csv')
# Destination folder for the converted PNG masks.
out_path = Path('train')/'masks'
# `parents=True` so this cell also works when `train/` does not exist yet.
out_path.mkdir(exist_ok=True, parents=True)

In [None]:
# Convert every `.mat` label file listed in `info.csv` into a PNG class mask.
for _, row in progress_bar(df.iterrows(), total=len(df)):

    label = sio.loadmat(mat_path/f'{row.Filename}.mat')
    inst_map = label['inst_map']

    # `id` has shape (N, 1), where N is the number of nuclei; `class` holds the
    # nuclear categories. Flatten both so that a file with a single nucleus
    # still yields 1-D sequences (np.squeeze would collapse it to a scalar).
    nuclei_id = np.asarray(label['id']).ravel().tolist()
    classes = np.asarray(label['class']).ravel()
    if len(nuclei_id) != len(classes):
        raise ValueError(
            f'{row.Filename}: {len(nuclei_id)} nuclei ids but {len(classes)} class entries')

    # Look classes up by nucleus id rather than by position: the sorted
    # instance values of `inst_map` are not guaranteed to be in the same order
    # as the rows of `id`/`class`.
    id_to_class = dict(zip(nuclei_id, classes.tolist()))

    # Instance values present in the map, excluding background (0). Filtering
    # by value (not by position) is also correct when no background pixel exists.
    unique_values = np.unique(inst_map)
    unique_values = unique_values[unique_values != 0].tolist()

    clabels = np.zeros_like(inst_map, dtype='uint8')
    for value in unique_values:
        if value not in id_to_class:
            raise KeyError(f'{row.Filename}: instance {value} has no entry in the id array')
        # Paint all pixels of this nucleus with its class id.
        clabels[inst_map == value] = id_to_class[value]

    # NOTE(review): `num_classes` is derived per file from the largest class id
    # present in that file - confirm this is what `preprocess_mask` expects.
    labels = preprocess_mask(clabels=clabels, instlabels=inst_map, remove_connectivity=True, num_classes=classes.max()+1)

    imageio.imsave(out_path/f'{row.Filename}.png', labels)

## 4. Split into train and test set

In [None]:
# Settings
train_path = Path('train')
test_path = Path('test')
image_folder = 'images'
mask_folder = 'masks'
mask_suffix = '.png'

# Functions for copying data
cp_fn = lambda o: test_path/image_folder/p.name
cp_fn_msk = lambda o: test_path/mask_folder/p.name
label_fn = lambda o: train_path/mask_folder/f'{o.stem}{mask_suffix}'

(test_path/image_folder).mkdir(exist_ok=True, parents=True)
(test_path/mask_folder).mkdir(exist_ok=True)

In [None]:
# The held-out files were originally drawn with:
#   f_names = get_image_files(train_path/image_folder)
#   _, val = train_test_split(f_names, train_size=0.8, shuffle=True, random_state=0)
# Original split was not sorted, so the resulting file names are fixed here.

# Image numbers of the held-out files, grouped by source dataset.
_val_ids = {
    'consep':  [10, 3, 6, 9],
    'crag':    [19, 2, 22, 25, 27, 30, 33, 44, 48, 54, 58, 64],
    'dpath':   [11, 12, 17, 21, 22, 24, 31, 33, 34, 40, 48, 52, 56, 65, 8],
    'glas':    [11, 12, 16, 18, 29, 32, 41, 45, 46, 51, 59, 60],
    'pannuke': [11, 13, 15, 2, 20],
}

# Expand to full image paths, keeping the order of the table above.
val = [Path(f'train/images/{source}_{number}.png')
       for source, numbers in _val_ids.items()
       for number in numbers]

In [None]:
# Move every held-out image together with its mask from the train to the test folders.
for p in progress_bar(val):
    image_target = cp_fn(p)
    mask_source = label_fn(p)
    mask_target = cp_fn_msk(p)
    shutil.move(str(p), str(image_target))
    shutil.move(str(mask_source), str(mask_target))