In [None]:
import ast

import cv2
import numpy as np
import pandas as pd

def get_RLE_from_mask(mask):
    """Encode a mask as a run-length string of "start length" pairs.

    The mask is divided by 255 and truncated to integers, so pixels equal
    to 255 become 1 and smaller non-negative values become 0. Pixels are
    visited in flattened (row-major) order and run starts are 1-indexed.

    Args:
        mask: numpy array holding the mask values.

    Returns:
        Space-separated string "start1 length1 start2 length2 ...", or an
        empty string when there is no foreground run.
    """
    binary = (mask / 255).astype(int).ravel()

    # Zero sentinels on both sides make runs touching either end of the
    # flattened image produce an edge as well.
    padded = np.concatenate([[0], binary, [0]])

    # 1-indexed positions at which the value changes.
    edges = np.flatnonzero(padded[1:] != padded[:-1]) + 1

    # Odd slots hold run ends; turn them into run lengths in place.
    edges[1::2] -= edges[::2]

    return ' '.join(map(str, edges))


def get_mask_from_RLE(rle, height, width):
    """Decode a "start length start length ..." string into a 2-D mask.

    Args:
        rle: whitespace-separated integers; run starts are 1-indexed
            positions in the flattened (row-major) image.
        height: number of rows of the returned mask.
        width: number of columns of the returned mask.

    Returns:
        uint8 array of shape (height, width) with 255 inside the encoded
        runs and 0 everywhere else.
    """
    values = np.array([int(token) for token in rle.split()])
    flat = np.zeros((height * width), dtype=np.uint8)

    # Even slots are 1-indexed starts (shifted to 0-based here), odd slots
    # are the matching run lengths.
    for begin, span in zip(values[::2] - 1, values[1::2]):
        flat[begin:begin + span] = 255

    return flat.reshape((height, width))

def getDenseMask(graph, imagesize = 1024):
    """Rasterise a contour into a filled square mask.

    Args:
        graph: numpy array of (x, y) points; it is reshaped to (-1, 1, 2)
            and cast to int before drawing.
        imagesize: side length in pixels of the square output image.

    Returns:
        Array of shape (imagesize, imagesize) in numpy's default float
        dtype, with the contour drawn filled using the value 255 on a
        zero background.
    """
    canvas = np.zeros((imagesize, imagesize))
    contour = graph.reshape(-1, 1, 2).astype('int')
    # contourIdx=-1 draws every contour in the list; cv2.FILLED (-1) as the
    # thickness fills the interior instead of drawing an outline.
    return cv2.drawContours(canvas, [contour], -1, 255, cv2.FILLED)

input_path = "julia_annotations.xls"
dataframe = pd.read_excel(input_path, header=0, index_col=0)

# Fixed number of landmarks exported per structure.
RL_POINTS = 44
LL_POINTS = 50
H_POINTS = 26


def _parse_contour(text):
    """Turn a "[[x1, y1], [x2, y2], ...]" string into integer coordinates.

    The values are rescaled with `/ 100 * 1024` (presumably percentages of
    a 1024-pixel image -- confirm against the annotation export) and then
    rounded to the nearest integer.
    """
    # NOTE(security): the original used eval() on spreadsheet cells, which
    # would execute arbitrary code found in the file. ast.literal_eval only
    # accepts Python literals, which is all a coordinate list needs.
    points = np.array(ast.literal_eval(text)) / 100 * 1024
    return np.round(points, 0).astype(int)


def _fit_point_count(points, count):
    """Return exactly `count` points: drop extras or repeat the last point."""
    if len(points) > count:
        return points[:count]
    if len(points) < count:
        # Integer padding keeps the dtype integral. A float pad would turn
        # the whole row's landmarks into "512.0"-style strings.
        padding = np.ones((count - len(points), 2), dtype=int) * points[-1]
        return np.concatenate([points, padding])
    return points


# Each name-mapping CSV is read once and reused for every row that needs it.
_name_mappings = {}

# Rows are collected here and turned into a DataFrame once, after the loop.
rows = []

# iter rows
for index, row in dataframe.iterrows():
    # columns are Path, Right lung, Left lung, Heart, Comments

    # The index holds the path; its first character is dropped and the
    # remainder is split as "<dataset>/<image name>".
    path = str(index)[1:]
    path_parts = path.split("/")
    dataset = path_parts[0]
    img_name = path_parts[1]

    name_mapping_file = "ToAnnotate/" + dataset + "_name_mapping.csv"
    if name_mapping_file not in _name_mappings:
        _name_mappings[name_mapping_file] = pd.read_csv(name_mapping_file)
    _name_mapping = _name_mappings[name_mapping_file]

    original_name = _name_mapping[_name_mapping["New"] == img_name]["Original"].values[0]

    # Normalise the original file name (and, for some sources, the dataset
    # label) per dataset.
    if dataset == "MIMIC":
        original_name = original_name.split("/")[-1].replace(".jpg", "")
        dataset = "MIMIC-CXR-JPG"
    elif dataset == "CheXpert":
        original_name = original_name.replace("Datasets/CheXpert/Preprocessed/", "").replace(".png", ".jpg")
    elif dataset == "CANDID":
        original_name = original_name.split("/")[-1]
        dataset = "CANDID-PTX"
    elif dataset == "Padchest":
        original_name = original_name.split("/")[-1]
    elif dataset == "ChestX-Ray8":
        original_name = original_name.split("/")[-1]
    elif dataset == "VinBigData":
        original_name = original_name.split("/")[-1].replace(".png", "")
        dataset = "VinDr-CXR"

    # RL, LL, and H are strings like "[[x1, y1], [x2, y2], ...]"
    # we need to convert them to numpy arrays
    RL = _parse_contour(row["right_lung"])
    LL = _parse_contour(row["left_lung"])
    H = _parse_contour(row["heart"])

    # The dense masks are drawn from the full contours, before the landmark
    # lists are cut or padded below.
    RL_ = getDenseMask(RL)
    LL_ = getDenseMask(LL)
    H_ = getDenseMask(H)

    RL_RLE = get_RLE_from_mask(RL_)
    LL_RLE = get_RLE_from_mask(LL_)
    H_RLE = get_RLE_from_mask(H_)

    # Sometimes there are more than 44 points for RL, 50 for LL, and 26 for H
    # due to some LabelStudio issues
    # But it's only one or two in some minor cases, so we can just cut it off or pad it with the last point
    RL = _fit_point_count(RL, RL_POINTS)
    LL = _fit_point_count(LL, LL_POINTS)
    H = _fit_point_count(H, H_POINTS)

    data = np.concatenate([RL, LL, H])

    flattened_data = data.flatten()
    coordinates_str = ','.join(map(str, flattened_data))

    print("Original name: ", original_name)

    image_id = original_name

    rows.append({
        'ImageID': image_id,
        'Dataset': dataset,
        'Landmarks': coordinates_str,
        'Right Lung': RL_RLE,
        'Left Lung': LL_RLE,
        'Heart': H_RLE,
    })

# Building the frame once avoids re-copying it on every iteration.
output = pd.DataFrame(
    rows,
    columns=['ImageID', 'Dataset', 'Landmarks', 'Right Lung', 'Left Lung', 'Heart'],
)

output.to_csv(input_path.replace(".xls", ".csv"), index=False)