# Validation of the results with statistical measurements!

In [1]:
import glob
import os
import warnings

warnings.filterwarnings('ignore')

import cv2
import numpy as np
import pandas as pd
import scipy.io
from scipy import ndimage as ndi
from scipy.optimize import linear_sum_assignment

from utils.metrics import *

# Conventional Metrics

First we calculate the most conventional segmentation quality measures: DICE (DICE1), the Jaccard index (IoU), sensitivity and specificity. Specificity can be maximized by using the thresholded post-processing results. These measures are pixel-wise; they do NOT measure how many nuclei instances were detected correctly. The instance-level measures are calculated in the next section.

### Now let's calculate the metrics for pixel labels

In [2]:
# Input and output locations for the evaluation.
# Only the two roots below need to be edited to evaluate another test set or
# another model / post-processing combination.

# Folder holding the ground truth instance masks (.mat files)
GT_DIR = '/home/leos/IMAGE_ANALYSIS/datasets/H&E/nuclei_segmentation/exhaustive_annotations/Nucleisegmentation-Kumar/test/Labels/'

# Result root of the evaluated run (here: the random walker post-processing).
# Predictions are read from <root>/Labels/ and the metric tables are written
# to <root>/Metrics/, so both always belong to the same run.
RESULT_ROOT = '/home/leos/IMAGE_ANALYSIS/model_ouputs/Kumar/result_segmentations/V1/unet_smp_seresnet50_kumar/random_walker/'

# ground truth masks (sorted so that they line up with the sorted predictions)
gt_masks = sorted(glob.glob(GT_DIR + '*.mat'))

# predictions
model_preds = sorted(glob.glob(RESULT_ROOT + 'Labels/*.mat'))

# folder where the metric CSV files are saved
OUTDIR = RESULT_ROOT + "Metrics/"

In [7]:
# Sanity check of the inputs before any metric is computed.
# glob returns an empty list for a wrong path instead of raising, and the
# metric loops pair the two sorted lists by position, so both lists must be
# non-empty and of equal length.
print(len(gt_masks))
print(len(model_preds))

assert len(gt_masks) > 0, "no ground truth masks found - check the ground truth path"
assert len(model_preds) > 0, "no predicted masks found - check the predictions path"
assert len(gt_masks) == len(model_preds), "number of ground truth masks and predictions differ"

14
14
14


In [8]:
# Pixel-wise (binary mask) metrics for every ground truth / prediction pair.
# Result: `score_df` with one row per predicted mask file plus a final row
# holding the test set averages; the table is also saved as a CSV in OUTDIR.
metric_names = ["DICE", "Jaccard", "Sensitivity", "Specificity"]

# zip() stops at the shorter list without complaining, which would silently
# drop images from the evaluation, so refuse to continue on a mismatch.
if len(gt_masks) != len(model_preds):
    raise ValueError(
        "Found {} ground truth masks but {} predictions".format(len(gt_masks), len(model_preds))
    )

# Row labels: file names of the predicted masks
idxs = [os.path.basename(f) for f in model_preds]

# Loop through the predictions and ground truths
records = []
for gt, mask in zip(gt_masks, model_preds):
    gt_objects = scipy.io.loadmat(gt)['inst_map']
    pred_objects = scipy.io.loadmat(mask)['inst_map']

    # Calculate the metrics
    dice, ji, recall, specificity = conventional_metrics(gt_objects, pred_objects)
    records.append({'DICE': dice, 'Jaccard': ji, 'Sensitivity': recall, 'Specificity': specificity})

# Build the table once from all records instead of filling it row by row
score_df = pd.DataFrame(records, index=idxs, columns=metric_names)

# Per-metric lists, kept under their original names for later cells
# (recalls = sensitivities)
dices, jis, recalls, specificities = (score_df[name].tolist() for name in metric_names)

# Plain column means over the images; a NaN score propagates into the average
# exactly as it does with sum/len.
averages = score_df.to_numpy(dtype=float).mean(axis=0)
score_df.loc['averages for the test set'] = averages

# Show the results
os.makedirs(OUTDIR, exist_ok=True)  # the output folder may not exist yet
fn = os.path.join(OUTDIR, 'binary_metrics.csv')
score_df.to_csv(fn, index=True)
score_df

Unnamed: 0,DICE,Jaccard,Sensitivity,Specificity
TCGA-2Z-A9J9-01A-01-TS1_result_mask.mat,0.779637,0.638857,0.758916,0.954992
TCGA-44-2665-01B-06-BS6_result_mask.mat,0.814812,0.687496,0.72799,0.980535
TCGA-69-7764-01A-01-TS1_result_mask.mat,0.753314,0.604254,0.658756,0.978004
TCGA-A6-6782-01A-01-BS1_result_mask.mat,0.772911,0.629873,0.712115,0.977507
TCGA-AC-A2FO-01A-01-TS1_result_mask.mat,0.747545,0.596863,0.688779,0.953484
TCGA-AO-A0J2-01A-01-BSA_result_mask.mat,0.785074,0.64619,0.860253,0.915904
TCGA-CU-A0YN-01A-02-BSB_result_mask.mat,0.808261,0.67822,0.868831,0.925853
TCGA-EJ-A46H-01A-03-TSC_result_mask.mat,0.799185,0.665536,0.744373,0.979084
TCGA-FG-A4MU-01B-01-TS1_result_mask.mat,0.839864,0.723936,0.806183,0.970785
TCGA-GL-6846-01A-01-BS1_result_mask.mat,0.764101,0.618256,0.649,0.988584


# More Sophisticated Metrics

These measures pair every ground truth nucleus instance with its corresponding predicted instance, instead of only comparing the whole binary masks superimposed on top of each other like the conventional metrics above do.
The metrics are implemented in Python (imported from `utils.metrics`), and below we calculate:
- Aggregated Jaccard index introduced here: Kumar, Neeraj, Ruchika Verma, Sanuj Sharma, Surabhi Bhargava, Abhishek Vahadane, and Amit Sethi. "A dataset and a technique for generalized nuclear segmentation for computational pathology." IEEE transactions on medical imaging 36, no. 7 (2017): 1550-1560.
- Aggregated Jaccard index plus introduced in: Graham, S., Vu, Q. D., Raza, S. E. A., Azam, A., Tsang, Y. W., Kwak, J. T., & Rajpoot, N. (2019). Hover-Net: Simultaneous segmentation and classification of nuclei in multi-tissue histology images. Medical Image Analysis, 58, 101563. 
- Ensemble DICE (DICE2) introduced here: Vu, Quoc Dang, Simon Graham, Minh Nguyen Nhat To, Muhammad Shaban, Talha Qaiser, Navid Alemi Koohbanani, Syed Ali Khurram et al. "Methods for Segmentation and Classification of Digital Microscopy Tissue Images." arXiv preprint arXiv:1810.13230 (2018).
- Panoptic quality (PQ), introduced here: Kirillov, Alexander, Kaiming He, Ross Girshick, Carsten Rother, and Piotr Dollár. "Panoptic Segmentation." arXiv preprint arXiv:1801.00868 (2018). Good blog post about this here: https://medium.com/@danielmechea/panoptic-segmentation-the-panoptic-quality-metric-d69a6c3ace30


Check out `utils/metrics.py` if you are interested in the implementations, which were done by the TIA-lab people.

### Calculate all the above metrics

In [9]:
#########################################################
# Count scores for each file
#
# Instance-level metrics for every ground truth / prediction pair.
# Result: `score_df` with one row per predicted mask file plus a final row
# holding the test set averages; the table is also saved as a CSV in OUTDIR.
metric_names = ["AJI", "AJI plus", "DICE2", "PQ", "SQ", "DQ", "inst Sensitivity", "inst Precision"]

# zip() stops at the shorter list without complaining, which would silently
# drop images from the evaluation, so refuse to continue on a mismatch.
if len(gt_masks) != len(model_preds):
    raise ValueError(
        "Found {} ground truth masks but {} predictions".format(len(gt_masks), len(model_preds))
    )

# Row labels: file names of the predicted masks
idxs = [os.path.basename(f) for f in model_preds]

# Loop through the predictions and ground truths
records = []
for gt, mask in zip(gt_masks, model_preds):
    true = scipy.io.loadmat(gt)['inst_map']
    pred = scipy.io.loadmat(mask)['inst_map']

    # Calculate the metrics
    pq = PQ(true, pred)
    aji = AJI(true, pred)
    aji_p = AJI_plus(true, pred)
    dice2 = DICE2(true, pred)

    records.append({
        'AJI': aji,
        'AJI plus': aji_p,
        'DICE2': dice2,
        'PQ': pq['pq'],  # panoptic quality
        'SQ': pq['sq'],  # segmentation quality
        'DQ': pq['dq'],  # detection quality i.e. F1-score
        'inst Sensitivity': pq['sensitivity'],  # sensitivity in detecting matching nuclei
        'inst Precision': pq['precision'],  # precision (not specificity) in detecting matching nuclei
    })

# Build the table once from all records instead of filling it row by row
score_df = pd.DataFrame(records, index=idxs, columns=metric_names)

# Per-metric lists, kept under their original names for later cells
ajis, aji_ps, dice2s, pqs, sqs, dqs, sensitivities, precisions = (
    score_df[name].tolist() for name in metric_names
)

# Plain column means over the images; a NaN score propagates into the average
# exactly as it does with sum/len.
averages = score_df.to_numpy(dtype=float).mean(axis=0)
score_df.loc['averages for the test set'] = averages

# Show the results
os.makedirs(OUTDIR, exist_ok=True)  # the output folder may not exist yet
fn = os.path.join(OUTDIR, 'aggregated_instance_metrics.csv')
score_df.to_csv(fn, index=True)
score_df

Unnamed: 0,AJI,AJI plus,DICE2,PQ,SQ,DQ,inst Sensitivity,inst Precision
TCGA-2Z-A9J9-01A-01-TS1_result_mask.mat,0.493746,0.511642,0.669189,0.448725,0.702037,0.639175,0.593043,0.693089
TCGA-44-2665-01B-06-BS6_result_mask.mat,0.482821,0.505283,0.65793,0.492137,0.732015,0.672304,0.58313,0.793677
TCGA-69-7764-01A-01-TS1_result_mask.mat,0.531523,0.537783,0.722975,0.515479,0.705307,0.730858,0.714286,0.748219
TCGA-A6-6782-01A-01-BS1_result_mask.mat,0.545789,0.563542,0.747287,0.532912,0.732755,0.727273,0.671202,0.793566
TCGA-AC-A2FO-01A-01-TS1_result_mask.mat,0.479448,0.499288,0.663689,0.46066,0.69099,0.666667,0.631478,0.706009
TCGA-AO-A0J2-01A-01-BSA_result_mask.mat,0.427662,0.492819,0.601521,0.44577,0.725999,0.614009,0.578652,0.653968
TCGA-CU-A0YN-01A-02-BSB_result_mask.mat,0.514656,0.552553,0.699263,0.535827,0.746455,0.717829,0.6839,0.755302
TCGA-EJ-A46H-01A-03-TSC_result_mask.mat,0.60309,0.610504,0.77287,0.573052,0.719156,0.79684,0.757511,0.840476
TCGA-FG-A4MU-01B-01-TS1_result_mask.mat,0.548095,0.598411,0.708287,0.563049,0.735763,0.765258,0.701075,0.842377
TCGA-GL-6846-01A-01-BS1_result_mask.mat,0.557603,0.561102,0.737058,0.525784,0.724048,0.726172,0.676056,0.784314
