In [10]:
import pandas as pd 
import numpy as np 
import cv2 

def get_RLE_from_mask(mask):
    mask = (mask / 255).astype(int)
    pixels = mask.flatten()
    pixels = np.concatenate([[0], pixels, [0]])
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]
    return ' '.join(str(x) for x in runs)

def get_mask_from_RLE(rle, height, width):
    runs = np.array([int(x) for x in rle.split()])
    starts = runs[::2]
    lengths = runs[1::2]

    mask = np.zeros((height * width), dtype=np.uint8)

    for start, length in zip(starts, lengths):
        start -= 1  
        end = start + length
        mask[start:end] = 255

    mask = mask.reshape((height, width))
    
    return mask

def getDenseMask(graph, imagesize = 1024):
    img = np.zeros([imagesize,imagesize])
    graph = graph.reshape(-1, 1, 2).astype('int')
    img = cv2.drawContours(img, [graph], -1, 255, -1)
    return img

input_path = "martina_annotations_set-findings.xls"
dataframe = pd.read_excel(input_path, header=0, index_col=0)

output = pd.DataFrame(columns=['ImageID', 'Dataset', 'Landmarks', 'Right Lung', 'Left Lung', 'Heart'])

# iter rows
for index, row in dataframe.iterrows():
    # columns are Path, Right lung, Left lung, Heart, Comments
    
    path = str(index)[1:]
        
    try:
        dataset = path.split("/")[0]
        disease = path.split("/")[1]
        img_name = path.split("/")[2]
    except:
        dataset = "ChestX-Ray8"
        img_name = path
        disease = ""
    
    if dataset == "MIMIC-CXR-JPG":
        original_name = img_name.split("_")[-1].replace(".jpg", "")
    elif dataset == "Padchest":
        original_name = img_name
    elif dataset == "ChestX-Ray8":
        original_name = img_name
    elif dataset == "CANDID-PTX":
        original_name = img_name
    elif dataset == "VinDr-CXR":
        original_name = img_name[:-4]
    elif dataset == "CheXpert":
        original_name = img_name.replace("_", "/").replace("/fr", "_fr").replace(".png",".jpg")
        
    RL = row["right_lung"]
    LL = row["left_lung"]
    H = row["heart"]
    
    # RL, LL, and H are strings like "[[x1, y1], [x2, y2], ...]"
    # we need to convert them to numpy arrays
    
    RL = np.array(eval(RL)) / 100 * 1024
    RL = np.round(RL, 0).astype(int)
    LL = np.array(eval(LL)) / 100 * 1024
    LL = np.round(LL, 0).astype(int)
    H = np.array(eval(H)) / 100 * 1024
    H = np.round(H, 0).astype(int)
        
    RL_ = getDenseMask(RL)
    LL_ = getDenseMask(LL)
    H_ = getDenseMask(H)
    
    RL_RLE = get_RLE_from_mask(RL_)
    LL_RLE = get_RLE_from_mask(LL_)
    H_RLE = get_RLE_from_mask(H_)
    
    # Sometimes there are more than 44 points for RL, 50 for LL, and 26 for H 
    # due to some LabelStudio issues
    # But it's only one or two in some minor cases, so we can just cut it off or pad it with the last point
    
    if len(RL) > 44:
        RL = RL[:44]
    elif len(RL) < 44:
        RL = np.concatenate([RL, np.ones((44 - len(RL), 2)) * RL[-1]])
    
    if len(LL) > 50:
        LL = LL[:50]
    elif len(LL) < 50:
        LL = np.concatenate([LL, np.ones((50 - len(LL), 2)) * LL[-1]])
    
    if len(H) > 26:
        H = H[:26]
    elif len(H) < 26:
        H = np.concatenate([H, np.ones((26 - len(H), 2)) * H[-1]])
    
    data = np.concatenate([RL, LL, H])
        
    flattened_data = data.flatten()
    coordinates_str = ','.join(map(str, flattened_data))
    
    print("Original name: ", original_name)
    
    id = original_name
    
    new_row = {
        'ImageID': id,
        'Dataset': dataset, 
        'Landmarks': coordinates_str, 
        'Right Lung': RL_RLE, 
        'Left Lung': LL_RLE, 
        'Heart': H_RLE
    }

    output = pd.concat([output, pd.DataFrame([new_row])], ignore_index=True)
    
output.to_csv(input_path.replace(".xls", ".csv"), index=False)

Original name:  0e39840b-24e7c4c5-b05e2bf9-f6aab11a-dc323629
Original name:  c030283d-b5cd5262-a59288c9-c1a4ad57-b4fbca10
Original name:  c3d68978-16adbf15-8549fc2d-640e2b44-823afeb9
Original name:  0d208c41-8d0bb2b0-aeda94a9-e8bf1872-fbe28a24
Original name:  a1c37055-a0367e02-917befd4-fe67e568-3cb650a5
Original name:  f894f8f9-b349a21f-6b7c8164-af30f758-4ac73289
Original name:  f8b6e835-89d234dd-a8de9f18-5a723f58-279d4cc8
Original name:  66049acc-ee1e6b29-1298f53b-32d453b3-41377f10
Original name:  2a1f9d96-ef385d23-58689072-0a84ce9c-0ae5a38e
Original name:  b8d59b2a-be55b95c-770d7295-b8f3b006-af21dee7
Original name:  9f0d230b-b83bb5af-e2b73a21-9e5b6357-82d0e148
Original name:  b9f0582f-f6a3d0c2-ab526960-547920ed-d4ff898d
Original name:  76f38ea0-13104a68-b7fccfd8-c2876f1e-05eb3817
Original name:  47715176-d3c203b0-e49bac9f-187685ad-481553c1
Original name:  160aeb25-0f25a53d-0feb594b-1ae2e150-e7acd609
Original name:  2577b081-c65a3a97-870ec2a8-96658cc1-4aa4b2a6
Original name:  b204bd07

In [6]:
df1 = pd.read_csv(input_path.replace(".xls", ".csv"))
df2 = pd.read_csv("julia_annotations.csv")

df3 = pd.concat([df1, df2], ignore_index=True)
df3.to_csv("julia_annotations_merge.csv", index=False)

In [7]:
import pandas as pd 

physician_annotations = "julia_annotations_merge.csv"

df = pd.read_csv(physician_annotations)

datasets = df["Dataset"].unique()
dataset_path = "../../../Annotations/Preprocessed/"

for dataset in datasets:
    
    data = pd.read_csv(dataset_path + dataset + ".csv")    
    column_name = data.columns[0]
    
    subset_annotations = df[df["Dataset"] == dataset]
    subset_dataset = data[data[column_name].isin(subset_annotations["ImageID"])]
    
    # save both to Subsets folder
    
    subset_annotations.to_csv("DataSubsets/" + dataset + "_annotations.csv", index = False)
    subset_dataset.to_csv("DataSubsets/" + dataset + ".csv", index = False)
    
    print("Saved: ", dataset)
    print("Annotations: ", len(subset_annotations))
    print("Dataset: ", len(subset_dataset))
    
    del subset_annotations
    del subset_dataset
    del data
    

Saved:  MIMIC-CXR-JPG
Annotations:  49
Dataset:  49
Saved:  Padchest
Annotations:  50
Dataset:  44
Saved:  ChestX-Ray8
Annotations:  50
Dataset:  50
Saved:  CANDID-PTX
Annotations:  50
Dataset:  50
Saved:  VinDr-CXR
Annotations:  50
Dataset:  50
Saved:  CheXpert
Annotations:  56
Dataset:  56


In [12]:
df1 = pd.read_csv(input_path.replace(".xls", ".csv"))
df2 = pd.read_csv("martina_annotations.csv")

df3 = pd.concat([df1, df2], ignore_index=True)
df3.to_csv("martina_annotations_merge.csv", index=False)

In [13]:
import pandas as pd 

physician_annotations = "martina_annotations_merge.csv"

df = pd.read_csv(physician_annotations)

datasets = df["Dataset"].unique()
dataset_path = "../../../Annotations/Preprocessed/"

for dataset in datasets:
    subset_annotations = df[df["Dataset"] == dataset]
    
    # save both to Subsets folder
    
    subset_annotations.to_csv("DataSubsets/" + dataset + "_martina_annotations.csv", index = False)
    
    print("Saved: ", dataset)
    print("Annotations: ", len(subset_annotations))
    
    del subset_annotations

Saved:  MIMIC-CXR-JPG
Annotations:  49
Saved:  Padchest
Annotations:  50
Saved:  ChestX-Ray8
Annotations:  50
Saved:  CANDID-PTX
Annotations:  50
Saved:  VinDr-CXR
Annotations:  50
Saved:  CheXpert
Annotations:  56
