In [1]:
import os, sys
sys.path.insert(0, "\\".join(os.path.abspath(os.curdir).split("\\")[:-2:]))

from src.utils.utils import *

### Pegar Metadados em csv e dicom - CBIS-DDSM

In [2]:
def get_study_metadata(paths: dict, metadata_csv: "pd.DataFrame", indice: int,
                       base_dir: str = "D:/mathe/Documents/BancoDados_IC/Mamografia/CBIS-DDSM") -> dict:
    """Collect the CSV and DICOM metadata of one CBIS-DDSM study.

    Args:
        paths: Maps an image kind to the study directory, relative to
            ``base_dir``. The keys must be ``"original"`` and/or ``"cropped"``
            (the only buckets created for the DICOM metadata), and
            ``paths['original']`` must exist because it is passed to
            ``study_factory`` as the first argument.
        metadata_csv: Case-description table; only its first 11 columns are read.
        indice: Index label of the row to extract (looked up with
            ``metadata_csv[col][indice]``).
        base_dir: Root directory of the CBIS-DDSM DICOM files. Defaults to the
            location that was previously hard-coded in this function.

    Returns:
        Whatever ``study_factory(paths['original'], csv_metadata, dicom_metadata)``
        returns.

    Raises:
        IndexError: If no ``*.dcm*`` file is found under one of the directories.
    """
    # --- Metadata stored in the dataframe --------------------------------
    dic_meta_in_csv = {}

    for col in metadata_csv.columns[:11]:
        value = metadata_csv[col][indice]
        if isinstance(value, np.integer):
            # NumPy integers are converted to plain Python ints.
            dic_meta_in_csv[col] = int(value)
        elif pd.isna(value):
            # Missing values are stored as the string "NaN".
            dic_meta_in_csv[col] = "NaN"
        else:
            dic_meta_in_csv[col] = value

    # --- Metadata stored in the DICOM objects ----------------------------
    metadata_dicom_files = {"original": [], "cropped": []}

    for key, path in paths.items():
        directory = Path(f"{base_dir}/{path}/")
        paths_dicom_images = list(directory.rglob("*.dcm*"))

        # Record the folder that holds the first DICOM found (file name
        # dropped), always written with forward slashes.
        dic_meta_in_csv[f'{key}_image_path'] = paths_dicom_images[0].parent.as_posix()

        for path_dicom in paths_dicom_images:
            dicom_file = dcmread(path_dicom)
            # NOTE(review): the meaning of the second argument (False) is
            # defined by get_dicom_meta, which is not visible here.
            metadata_dicom_files[key].append(get_dicom_meta(dicom_file, False))

    return study_factory(paths['original'], dic_meta_in_csv, metadata_dicom_files)

### Salvar metadados em arquivos JSON - CBIS-DDSM

In [3]:
# CBIS-DDSM case-description tables (CSV file names without extension).
metadata_files = ['calc_case_description_test_set',
                  'mass_case_description_train_set',
                  'calc_case_description_train_set',
                  'mass_case_description_test_set']

images_size_cbis_ddsm = []
studies = []

for current_meta in metadata_files:
    df = pd.read_csv(f"../../data/mamografia/CBIS_DDSM/{current_meta}.csv")

    # Loop-invariant: the first 11 columns are what get_study_metadata reads.
    metadata = df.iloc[::, :11:]

    for i in range(df.shape[0]):
        # Columns 11 and 12 are read as the original / cropped image paths.
        # Two-axis positional indexing is used because `df.iloc[i][11]` relies
        # on integer-as-position Series indexing, which pandas has deprecated.
        original_image_path = preprocessing_path(df.iloc[i, 11])
        cropped_images_path = preprocessing_path(df.iloc[i, 12])

        all_image_path = {"original": original_image_path, "cropped": cropped_images_path}

        path = "D:/mathe/Documents/BancoDados_IC/Mamografia/CBIS-DDSM"
        path = f"{path}/{all_image_path['original']}/"

        # Collect the sizes reported for the original images of this study.
        images_size_cbis_ddsm.extend(get_images_size(path, "dcm", True))

        studies.append(get_study_metadata(all_image_path, metadata, i))

save_json("extracted_metadata_CBIS-DDSM", studies, f"../../outputs/mamografia/extracted_metadata")

# One row per measured image, tagged with the dataset name.
images_size_cbis_ddsm = pd.DataFrame({'size_mb': images_size_cbis_ddsm, 'database': 'CBIS-DDSM'})

### Pegar Metadados em xlsx e dicom - CMMD

In [5]:
# Locations of the CMMD clinical spreadsheet and of the DICOM root folder.
path_metadata = "../../data/mamografia/CMMD/CMMD_clinicaldata_revision.xlsx"
path_cmmd = path = "D:/mathe/Documents/BancoDados_IC/Mamografia/CMMD"

# Missing spreadsheet cells are replaced by the string "NaN" up front.
df = pd.read_excel(path_metadata).fillna("NaN")

studies, images_size_cmmd = [], []

left_or_right_breast = {'L': 'Left', 'R': 'Right'}
abnormality = {'calcification': 'Calc', 'mass': 'Mass', 'both': 'Both'}

for i in range(len(df)):
    # Study name: <abnormality>_<column 0 value>_<breast side>.
    study_name = "_".join([
        abnormality[df.iloc[i, 4]],
        df.iloc[i, 0],
        left_or_right_breast[df.iloc[i, 1]],
    ])

    # Spreadsheet metadata: NumPy ints become Python ints, missing -> "NaN".
    metadata_csv = {}
    for column in df.columns:
        cell = df.loc[i, column]
        if isinstance(cell, np.integer):
            cell = int(cell)
        elif pd.isna(cell):
            cell = "NaN"
        metadata_csv[column] = cell

    # DICOM files of this row live under <root>/<ID1>/.
    path_exam_files = f"{path_cmmd}/{df.loc[i, 'ID1']}/"
    paths_dicom_images = list(Path(path_exam_files).rglob("*.dcm*"))

    images_size_cmmd.extend(get_images_size(path_exam_files, "dcm", True))

    # Store every DICOM path with forward slashes.
    metadata_csv['image_path'] = [str(dicom_path).replace("\\", "/")
                                  for dicom_path in paths_dicom_images]

    # Metadata read from each DICOM file.
    metadata_dicom_files = [get_dicom_meta(dcmread(dicom_path), False)
                            for dicom_path in paths_dicom_images]

    studies.append(study_factory(study_name, metadata_csv, metadata_dicom_files))

save_json("extracted_metadata_CMMD", studies, f"../../outputs/mamografia/extracted_metadata")
images_size_cmmd = pd.DataFrame({'size_mb': images_size_cmmd, 'database': 'CMMD'})

### Pegar Metadados em txt e pgm - MIAS

In [6]:
# Field names, in order, for the space-separated tokens of each txt record.
atributes_list_txt = ['reference_number', 'background_tissue', 'abnormality', 'classification',
                      'x_centre_abnormality', 'y_centre_abnormality',
                      'radius']
# Code -> readable label lookups for the coded txt fields.
classification_dict = {'B': "Benign", "M": "Malignant"}
abnormality_dict = {'CALC': 'Calcification', 'CIRC': 'Well-defined/circumscribed masses',
                    'SPIC': 'Spiculated masses', 'MISC': 'Other, ill-defined masses',
                    'ARCH': 'Architectural distortion', 'ASYM': 'Asymmetry', 'NORM': 'Normal'}
tissue_dict = {'F': 'Fatty', 'G': 'Fatty-glandular', 'D': 'Dense-glandular'}
# Token position -> lookup table used to decode that token.
field_decoders = {1: tissue_dict, 2: abnormality_dict, 3: classification_dict}

list_metadata = []

path = "D:/mathe/Documents/BancoDados_IC/Mamografia/MIAS/all-mias"
cropped_images_path = "D:/mathe/Documents/BancoDados_IC/Mamografia/MIAS/all-roi"
images_size_mias = get_images_size(path, "pgm", True)

with open("../../data/mamografia/MIAS/raw_metadata_mias.txt", "r") as file:
    # Skip the first line; slicing (unlike `del lines[0]`) tolerates an empty file.
    raw_metadata = file.readlines()[1:]

# ROI files already on disk, listed once instead of once per record.
existing_rois = set(os.listdir(cropped_images_path))

for current_meta in raw_metadata:
    tmp_list = current_meta.split(" ")
    if len(tmp_list) == 4:
        # Four tokens: the last one is discarded.
        del tmp_list[3]
    elif len(tmp_list) == 7:
        # Seven tokens: strip the newline from the last one (radius).
        tmp_list[6] = tmp_list[6].replace("\n", "")

    tmp_dict = {}
    image_path = f"{path}/{tmp_list[0]}.pgm"

    # --- Metadata from the txt record ------------------------------------
    if len(tmp_list) > 3:
        study_name = f"{tmp_list[2]}_{tmp_list[0]}_{classification_dict[tmp_list[3]]}"
    else:
        study_name = f"{tmp_list[2]}_{tmp_list[0]}"
    tmp_dict['study_name'] = study_name

    metadata_txt = {}
    for i, raw_value in enumerate(tmp_list):
        decoder = field_decoders.get(i)
        metadata_txt[atributes_list_txt[i]] = raw_value if decoder is None else decoder[raw_value]

    metadata_txt['image_path'] = image_path
    tmp_dict['metadata_txt'] = metadata_txt

    # --- Metadata from the PGM header ------------------------------------
    metadata_pgm_file = {}
    _, header = read_pgm(image_path)
    header = header.split("\n")

    metadata_pgm_file['header'] = header[0]

    # NOTE(review): the PGM format lists width before height; confirm that
    # read_pgm's header really yields "<rows> <columns>" in this order.
    row, columns = header[1].split(" ")
    metadata_pgm_file['rows'] = row
    # Fix: the original stored `row` here, so 'columns' duplicated 'rows'.
    metadata_pgm_file['columns'] = columns

    metadata_pgm_file['max_value'] = header[2]

    tmp_dict['metadata_pgm_file'] = metadata_pgm_file

    # --- Region of interest (ROI) ----------------------------------------
    # Only records that carry centre coordinates and a radius get a ROI.
    if all(field in metadata_txt
           for field in ('x_centre_abnormality', 'y_centre_abnormality', 'radius')):

        img = cv2.imread(image_path)
        x = int(metadata_txt['x_centre_abnormality'])
        y = int(metadata_txt['y_centre_abnormality'])
        radius = int(metadata_txt['radius'])

        # NOTE(review): (x, y) is used as-is with OpenCV's top-left origin;
        # confirm the txt coordinates are not bottom-left based.
        # Filled circle (thickness -1) on an empty single-channel mask.
        mascara = np.zeros(img.shape[:2], dtype=np.uint8)
        cv2.circle(mascara, (x, y), radius, 255, -1)

        # Keep only the pixels of the original image inside the circle.
        roi_circular = cv2.bitwise_and(img, img, mask=mascara)

        # The ROI file is named after the reference number only, so records
        # sharing a reference number reuse the first ROI written.
        roi_name = f"{tmp_list[0]}.png"
        if roi_name not in existing_rois:
            cv2.imwrite(f"{cropped_images_path}/{roi_name}", roi_circular)
            existing_rois.add(roi_name)
        tmp_dict['metadata_txt']['cropped_image_path'] = f"{cropped_images_path}/{tmp_list[0]}.png"

    list_metadata.append(tmp_dict)

save_json("extracted_metadata_MIAS", list_metadata, "../../outputs/mamografia/extracted_metadata")
images_size_mias = pd.DataFrame({'size_mb': images_size_mias, 'database': 'MIAS'})

### Pegar Metadados em csv e dicom - InBreast

Tratar os metadados no csv e xls

In [7]:
# Load both INbreast metadata tables.
csv_file = pd.read_csv("../../data/mamografia/INBREAST/INbreast.csv", sep=";")
xls_file = pd.read_excel("../../data/mamografia/INBREAST/INbreast.xls")

# 'File Name' is the join key; normalise it to str on both sides.
csv_file['File Name'] = csv_file['File Name'].astype(str)

# Keep only the join key and the findings notes. The explicit .copy() makes
# this an independent frame, so the assignment below does not write into a
# slice of the original (which triggers SettingWithCopyWarning).
xls_file = xls_file[['File Name', 'Findings Notes (in Portuguese)']].copy()
# Missing file names become 0 before the int -> str conversion.
xls_file['File Name'] = xls_file['File Name'].fillna(0).apply(int).astype(str)

# Inner join: only files present in both tables are kept.
metadata_csv = csv_file.merge(xls_file, how='inner', on='File Name')
# snake_case column names: lower-case, spaces replaced by underscores.
metadata_csv.columns = metadata_csv.columns.str.lower().str.replace(" ", "_")
metadata_csv = metadata_csv.rename(columns={'findings_notes_(in_portuguese)': 'findings_notes'})
metadata_csv['acquisition_date'] = metadata_csv['acquisition_date'].apply(str)

Salvar os metadados csv e dicom

In [8]:
# Raw strings: the plain literals contained invalid escape sequences
# (\m, \D, \B, \M, \I, \A). The resulting values are unchanged.
path_data = r"D:\mathe\Documents\BancoDados_IC\Mamografia\INBREAST\AllDICOMs"
cropped_images_path = r"D:\mathe\Documents\BancoDados_IC\Mamografia\INBREAST\AllROI"
dicom_files_names = [file for file in os.listdir(path_data) if ".dcm" in file]

studies = []
images_size_inbreast = []

for i in range(metadata_csv.shape[0]):
    # --- CSV metadata: copy every column of row i ------------------------
    meta_csv = {}
    for column in metadata_csv.columns[::]:
        meta_csv[column] = metadata_csv.loc[i, column]

    # --- DICOM metadata --------------------------------------------------
    # A DICOM belongs to this row when its file name contains the row's
    # 'file_name'. If several match, the last one wins; if none match,
    # dict_meta_dcm stays None and no 'image_path' is recorded.
    dict_meta_dcm = None
    for dicom_file_name in dicom_files_names:
        if metadata_csv.loc[i, 'file_name'] in dicom_file_name:
            image_path = path_data + f"\\{dicom_file_name}"
            meta_csv['image_path'] = image_path.replace("\\", "/")

            images_size_inbreast.append(get_images_size(image_path, "dcm", False))

            dicom_file = dcmread(image_path)
            dict_meta_dcm = get_dicom_meta(dicom_file, False)

    # --- Pre-process some fields -----------------------------------------
    # Any laterality other than 'R' is recorded as "Left".
    meta_csv['laterality'] = "Right" if meta_csv['laterality'] == 'R' else "Left"
    # NOTE(review): assumes findings_notes is always a string (never NaN).
    meta_csv['findings_notes'] = meta_csv['findings_notes'].replace("ó", "o")
    meta_csv['cropped_image_path'] = (cropped_images_path + f"/{meta_csv['file_name']}.roi").replace("\\", "/")

    # --- Assemble the study record ---------------------------------------
    study = {}
    # Study name: <first word of the findings notes>_<row index>_<laterality>.
    part_1 = meta_csv['findings_notes'].split()[0]
    study['study_name'] = f"{part_1}_{i}_{meta_csv['laterality']}"

    study['metadata_csv'] = meta_csv
    study['metadata_dicom_files'] = dict_meta_dcm

    studies.append(study)

save_json("extracted_metadata_INBREAST", studies, "../../outputs/mamografia/extracted_metadata")
images_size_inbreast = pd.DataFrame({'size_mb': images_size_inbreast, 'database': 'INBREAST'})

In [9]:
# Stack the per-database size tables (row-wise) and export them as one CSV.
size_tables = [
    images_size_cmmd,
    images_size_cbis_ddsm,
    images_size_mias,
    images_size_inbreast,
]
images_size = pd.concat(size_tables, axis=0)
images_size.to_csv("../../outputs/mamografia/images_size.csv")