    # Annotation Comparison Script

    This Jupyter Notebook contains a script that compares annotations made by three annotators: Nadiya, Rachelle, and Mark. It computes the Cohen's Kappa coefficient for each pair of annotators to measure inter-rater agreement. The script also identifies overlapping annotations between the annotators based on the Intersection over Union (IoU) metric.

    ## Execution    
Upon executing the script, it reads all the .geojson files in the specified target folder. The files are separated based on the annotator's names as suffixes. The script then computes the overlapping annotations between the annotators and calculates the Cohen's Kappa coefficients for each pair.

Step 1: Extract and Harmonize Annotations from Three Annotators
This step involves extracting annotations in the form of GeoJSON files from three annotators, harmonizing the class names based on a predefined mapping, and then saving the harmonized annotations in a new directory.

On execution, the script reads the GeoJSON files from the directories of the three annotators. It harmonizes the annotations based on the predefined mapping and saves the harmonized GeoJSON files in a new directory. This ensures that the subsequent steps of analysis have a consistent set of annotations from all annotators.

Step 2: From the folder off step 1 it calculates the cohen's kappa by creating bounding boxes of which the indices are taken from a IoU of 0.5


In [48]:
# extract double geosjon files from 3 annotators and save them in a new folder
import os
import json
from shapely.geometry import shape

class_mapping = {
    'macrophage': 'macrophage',
    'Macrophage': 'macrophage',
    'epithelium': 'epithelium',
    'Epithelium': 'epithelium',
    'neutrophil': 'neutrophil',
    'vascular endothelium': 'endothelium',
    'Vascular endothelium': 'endothelium',
    'melanophage': 'melanophage',
    'Melanophage': 'melanophage',
    'plasma cell': 'plasma cell',
    'Plasma cell': 'plasma cell',
    'lymphocytes': 'lymphocyte',
    'lymphocyte': 'lymphocyte',
    'tumor': 'tumor',
    'Tumor': 'tumor',
    'stroma': 'stroma',
    'Stroma': 'stroma',
    'eosinophil': 'eosinophil',
    'Eosinophil': 'eosinophil',
    'Immune cells': 'lymphocyte',
    'Connective': 'connective',
    'discuss': 'discuss',
    'discuss_willeke': 'discuss',
    'Discuss': 'discuss',
    'Necrosis': 'necrosis',
    'Other': 'other'
}


def iou(poly1, poly2):
    """Calculate Intersection Over Union for two polygons."""
    intersection = poly1.intersection(poly2).area
    union = poly1.union(poly2).area
    return intersection / union

def update_and_save_geojsons(folder1, folder2, folder3, suffix1, suffix2, suffix3):
    classes = set()
    target_folder = '/mnt/d/TIL_Melanoma_train_database/cell_segmentation/TUE/manual_segmented_tiles/all'
    
    # Make sure target folder exists
    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    # Get all GeoJSON filenames from all folders
    files1 = set([f for f in os.listdir(folder1) if f.endswith('.geojson')])
    files2 = set([f for f in os.listdir(folder2) if f.endswith('.geojson')])
    files3 = set([f for f in os.listdir(folder3) if f.endswith('.geojson')])

    # Only include GeoJSONs that are present in all the folders
    common_files = files1.intersection(files2).intersection(files3)

    # Process and save GeoJSONs for each folder
    for folder, suffix in [(folder1, suffix1), (folder2, suffix2), (folder3, suffix3)]:
        for filename in common_files:
            with open(os.path.join(folder, filename), 'r') as file:
                geojson = json.load(file)

            # Update class names
            for annot in geojson['features']:
                if 'classification' in annot['properties']:
                    old_class = annot['properties']['classification']['name']
                    new_class = class_mapping.get(old_class, 'other')
                    annot['properties']['classification']['name'] = new_class
                    classes.add(new_class)

            # Save the updated GeoJSON to the target folder
            new_filename = filename.split('.')[0] + suffix + '.geojson'
            with open(os.path.join(target_folder, new_filename), 'w') as file:
                json.dump(geojson, file)

    return classes

# Example usage:
nadiya = '/mnt/d/TIL_Melanoma_train_database/cell_segmentation/TUE/manual_segmented_tiles/nadiya'
rachelle = '/mnt/d/TIL_Melanoma_train_database/cell_segmentation/TUE/manual_segmented_tiles/rachelle'
mark = '/mnt/d/TIL_Melanoma_train_database/cell_segmentation/TUE/manual_segmented_tiles/mark'
update_and_save_geojsons(nadiya, rachelle, mark, '_nadiya', '_rachelle', '_mark')

{'connective',
 'discuss',
 'endothelium',
 'eosinophil',
 'epithelium',
 'lymphocyte',
 'macrophage',
 'melanophage',
 'necrosis',
 'neutrophil',
 'other',
 'plasma cell',
 'stroma',
 'tumor'}

In [49]:
# calculate cohen's kappa
import os
import json
from shapely.geometry import shape
from tqdm import tqdm
from detectron2.structures import BoxMode, pairwise_iou, Boxes
from sklearn.metrics import cohen_kappa_score
import torch

def polygon_to_bbox(polygon):
    """Convert Shapely polygon to a Detectron2 style bounding box [x1, y1, x2, y2]."""
    minx, miny, maxx, maxy = polygon.bounds
    return [minx, miny, maxx, maxy]

def get_overlapping_annotations(annotations1, annotations2):
    boxes1 = [BoxMode.convert(polygon_to_bbox(shape(annot['geometry'])), BoxMode.XYXY_ABS, BoxMode.XYXY_ABS) for annot in annotations1]
    boxes2 = [BoxMode.convert(polygon_to_bbox(shape(annot['geometry'])), BoxMode.XYXY_ABS, BoxMode.XYXY_ABS) for annot in annotations2]
    
    boxes1 = Boxes(torch.tensor(boxes1))
    boxes2 = Boxes(torch.tensor(boxes2))

    iou_matrix = pairwise_iou(boxes1, boxes2)
    overlap_indices = torch.nonzero(iou_matrix > 0.5, as_tuple=True)
    return overlap_indices

def compare_and_calculate_kappa(target_folder):
    all_files = os.listdir(target_folder)
    files_nadiya = [f for f in all_files if "_nadiya.geojson" in f]
    files_rachelle = [f for f in all_files if "_rachelle.geojson" in f]
    files_mark = [f for f in all_files if "_mark.geojson" in f]

    base_names_nadiya = {f.replace('_nadiya.geojson', '') for f in files_nadiya}
    base_names_rachelle = {f.replace('_rachelle.geojson', '') for f in files_rachelle}
    base_names_mark = {f.replace('_mark.geojson', '') for f in files_mark}
    common_base_names = base_names_nadiya.intersection(base_names_rachelle).intersection(base_names_mark)
    
    # Separate label lists for each comparison
    labels_nadiya_nr = []
    labels_rachelle_nr = []
    labels_mark_nr = []

    labels_nadiya_nm = []
    labels_rachelle_nm = []
    labels_mark_nm = []

    labels_nadiya_rm = []
    labels_rachelle_rm = []
    labels_mark_rm = []

    for base_name in tqdm(common_base_names):
        file_nadiya = os.path.join(target_folder, base_name + "_nadiya.geojson")
        file_rachelle = os.path.join(target_folder, base_name + "_rachelle.geojson")
        file_mark = os.path.join(target_folder, base_name + "_mark.geojson")

        with open(file_nadiya, 'r') as file:
            annotations_nadiya = json.load(file)['features']
        with open(file_rachelle, 'r') as file:
            annotations_rachelle = json.load(file)['features']
        with open(file_mark, 'r') as file:
            annotations_mark = json.load(file)['features']

        overlap_indices_nr = get_overlapping_annotations(annotations_nadiya, annotations_rachelle)
        overlap_indices_nm = get_overlapping_annotations(annotations_nadiya, annotations_mark)
        overlap_indices_rm = get_overlapping_annotations(annotations_rachelle, annotations_mark)

        # Nadiya vs Rachelle
        for idx1, idx2 in zip(*overlap_indices_nr):
            class1 = annotations_nadiya[idx1]['properties'].get('classification', {}).get('name', 'NoClass')
            class2 = annotations_rachelle[idx2]['properties'].get('classification', {}).get('name', 'NoClass')
            labels_nadiya_nr.append(class1)
            labels_rachelle_nr.append(class2)
            labels_mark_nr.append('NoMatch_NR')

        # Nadiya vs Mark
        for idx1, idx2 in zip(*overlap_indices_nm):
            class1 = annotations_nadiya[idx1]['properties'].get('classification', {}).get('name', 'NoClass')
            class2 = annotations_mark[idx2]['properties'].get('classification', {}).get('name', 'NoClass')
            labels_nadiya_nm.append(class1)
            labels_rachelle_nm.append('NoMatch_NM')
            labels_mark_nm.append(class2)

        # Rachelle vs Mark
        for idx1, idx2 in zip(*overlap_indices_rm):
            class1 = annotations_rachelle[idx1]['properties'].get('classification', {}).get('name', 'NoClass')
            class2 = annotations_mark[idx2]['properties'].get('classification', {}).get('name', 'NoClass')
            labels_nadiya_rm.append('NoMatch_RM')
            labels_rachelle_rm.append(class1)
            labels_mark_rm.append(class2)

    # Compute Cohen's Kappa for each pair
    kappa_nr = cohen_kappa_score(labels_nadiya_nr, labels_rachelle_nr)
    kappa_nm = cohen_kappa_score(labels_nadiya_nm, labels_mark_nm)
    kappa_rm = cohen_kappa_score(labels_rachelle_rm, labels_mark_rm)



    return kappa_nr, kappa_nm, kappa_rm

# Example Usage
kappa_value_nr, kappa_value_nm, kappa_value_rm = compare_and_calculate_kappa('/mnt/d/TIL_Melanoma_train_database/cell_segmentation/TUE/manual_segmented_tiles/all')
print(f"Cohen's Kappa (Nadiya vs. Rachelle): {kappa_value_nr}")
print(f"Cohen's Kappa (Nadiya vs. Mark): {kappa_value_nm}")
print(f"Cohen's Kappa (Rachelle vs. Mark): {kappa_value_rm}")


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:03<00:00,  3.14it/s]

Cohen's Kappa (Nadiya vs. Rachelle): 0.4440220585402995
Cohen's Kappa (Nadiya vs. Mark): 0.595776701978337
Cohen's Kappa (Rachelle vs. Mark): 0.5450879503667092





In [50]:
# view all classes in a folder of geojsons
import os
import json

def extract_classes_from_geojson(filepath):
    """Extracts classes from a given GeoJSON file."""
    with open(filepath, 'r') as file:
        geojson = json.load(file)
    classes = set()
    for feature in geojson['features']:
        if 'classification' in feature['properties']:
            classes.add(feature['properties']['classification']['name'])
    return classes

def extract_unique_classes_from_folders(*folders):
    """Extracts all unique classes from GeoJSONs across multiple folders."""
    unique_classes = set()
    for folder in folders:
        for filename in os.listdir(folder):
            if filename.endswith('.geojson'):
                file_path = os.path.join(folder, filename)
                classes_in_file = extract_classes_from_geojson(file_path)
                unique_classes.update(classes_in_file)
    return unique_classes

# Example usage:
nadiya = '/mnt/d/TIL_Melanoma_train_database/cell_segmentation/TUE/manual_segmented_tiles/nadiya'
rachelle = '/mnt/d/TIL_Melanoma_train_database/cell_segmentation/TUE/manual_segmented_tiles/rachelle'
mark = '/mnt/d/TIL_Melanoma_train_database/cell_segmentation/TUE/manual_segmented_tiles/mark'

all_unique_classes = extract_unique_classes_from_folders(nadiya, rachelle, mark)
print(all_unique_classes)


{'Plasma cell', 'discuss', 'Eosinophil', 'lymphocytes', 'neutrophil', 'discuss_willeke', 'Melanophage', 'Connective', 'Tumor', 'Other', 'Vascular endothelium', 'Macrophage', 'Necrosis', 'melanophage', 'eosinophil', 'vascular endothelium', 'Epithelium', 'endothelium', 'Discuss', 'lymphocyte', 'Stroma', 'epithelium', 'plasma cell', 'macrophage', 'Immune cells'}
