In [1]:
import os
import sys
import pandas as pd
import cv2

# Put the directory two levels above the current working directory on the
# import path so the `src` package import below resolves. Built with os.path
# rather than by splitting on a literal backslash, which only works on Windows
# (elsewhere the split yields a single element and "" ends up in sys.path).
sys.path.insert(0, os.path.abspath(os.path.join(os.curdir, os.pardir, os.pardir)))
from src.utils.utils import *  # noqa: E402, F403

### CBIS-DDSM

#### Variáveis e funções

In [None]:
# Root folder of the CBIS-DDSM dataset; the description CSVs are read from
# here and the DICOM files are searched under its `all-dicom` sub-folder.
path = 'D:/mathe/Documents/banco_dados_ic/mamografia/cbis-ddsm'

# Case-description CSV files (train + test), relative to `path`.
mass_case_path = [
    'mass_case_description_train_set.csv',
    'mass_case_description_test_set.csv',
]
calc_case_path = [
    'calc_case_description_train_set.csv',
    'calc_case_description_test_set.csv',
]

# Column renames for the mass cases (keys are the already lower-cased,
# underscore-separated CSV headers).
mass_case_columns_rename = {
    'patient_id': 'id',
    'assessment': 'bi-rads',
    'image_file_path': 'image_path',
    'cropped_image_file_path': 'cropped_image_path',
    'roi_mask_file_path': 'roi_image_path',
}

# Column renames for the calcification cases.
calc_case_columns_rename = {
    'patient_id': 'id',
    'assessment': 'bi-rads',
    'calc_type': 'calcification_type',
    'calc_distribution': 'calcification_distribution',
    'image_file_path': 'image_path',
    'cropped_image_file_path': 'cropped_image_path',
    'roi_mask_file_path': 'roi_image_path',
}

def update_dicom_paths(df, base_path=None):
    """Replace the CSV-relative DICOM paths in ``df`` with files found on disk.

    For each row, the first component of ``image_path`` and of
    ``cropped_image_path`` is used as a folder name under
    ``<base_path>/all-dicom`` and searched recursively for ``*.dcm*`` files.
    The frame is modified in place (columns ``image_path``,
    ``cropped_image_path`` and ``roi_image_path``); nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``image_path`` and ``cropped_image_path`` columns holding
        '/'-separated relative paths.
    base_path : str or path-like, optional
        Dataset root. Defaults to the notebook-level ``path`` variable, read
        at call time (the original behaviour).

    Raises
    ------
    FileNotFoundError
        If no full-image DICOM, or fewer than two crop/ROI DICOMs, are found
        for a row (previously an opaque ``IndexError``).
    """
    dicom_root = Path(path if base_path is None else base_path) / 'all-dicom'

    # Snapshot the columns first so the in-place writes below cannot affect
    # what is being iterated.
    rows = zip(df.index, df['image_path'].tolist(), df['cropped_image_path'].tolist())

    for row_label, image_path, cropped_image_path in rows:
        # Full mammogram: first DICOM (in sorted order, for determinism)
        # found under the image's top-level folder.
        image_dir = dicom_root / image_path.split('/')[0]
        full_images = sorted(image_dir.rglob("*.dcm*"))
        if not full_images:
            raise FileNotFoundError(f"No DICOM file found under {image_dir}")
        df.loc[row_label, 'image_path'] = str(full_images[0])

        # Cropped image and ROI mask share one top-level folder.
        pair_dir = dicom_root / cropped_image_path.split('/')[0]
        pair = sorted(pair_dir.rglob("*.dcm*"))
        if len(pair) < 2:
            raise FileNotFoundError(
                f"Expected cropped and ROI DICOM files under {pair_dir}, found {len(pair)}")

        # The two files are told apart by pixel height: fewer than 900 rows is
        # treated as the crop, otherwise as the ROI mask.
        # NOTE(review): 900 is a heuristic threshold inherited from the
        # original code - confirm it holds for every case.
        first_height = dcmread(pair[0]).pixel_array.shape[0]
        if first_height < 900:
            cropped_file, roi_file = pair[0], pair[1]
        else:
            roi_file, cropped_file = pair[0], pair[1]

        # Store plain strings, consistent with `image_path` above.
        df.loc[row_label, 'roi_image_path'] = str(roi_file)
        df.loc[row_label, 'cropped_image_path'] = str(cropped_file)

#### Extração de metadados

In [None]:
def _load_case_descriptions(csv_names, columns_rename):
    """Read the given description CSVs from `path` and return one normalised frame."""
    # Single concat over all files instead of growing an empty DataFrame,
    # which pandas deprecates for empty entries.
    cases = pd.concat(
        [pd.read_csv(f'{path}/{csv_name}') for csv_name in csv_names],
        ignore_index=True,
    )

    # Normalise headers (lower case, underscores) before applying the renames.
    cases.columns = cases.columns.str.lower().str.replace(' ', '_')
    cases = cases.rename(columns=columns_rename)

    # Lower-case the categorical text columns.
    for column in ('pathology', 'image_view'):
        cases[column] = cases[column].str.lower()
    return cases


# One frame per abnormality type, train and test sets stacked together.
mass_case_df = _load_case_descriptions(mass_case_path, mass_case_columns_rename)
calc_case_df = _load_case_descriptions(calc_case_path, calc_case_columns_rename)

# Point image / cropped / ROI columns at the DICOM files found on disk.
update_dicom_paths(mass_case_df)
update_dicom_paths(calc_case_df)

# Collect the DICOM metadata of every full image (only `image_path` is read;
# the cropped and ROI files are not opened here).
mass_case_json = []
calc_case_json = []

for case_df, case_records in zip([mass_case_df, calc_case_df], [mass_case_json, calc_case_json]):
    for case_id, image_path in zip(case_df['id'], case_df['image_path']):
        dicom_file = dcmread(image_path)
        case_records.append({'id': case_id, 'image_path': image_path} | get_dicom_meta(dicom_file))


# Persist the tabular metadata (CSV) and the per-image DICOM metadata (JSON).
mass_case_df.to_csv('../../outputs/mamografia/cbis-ddsm/metadata_mass_case_cbis-ddsm.csv', index=False)
calc_case_df.to_csv('../../outputs/mamografia/cbis-ddsm/metadata_calc_case_cbis-ddsm.csv', index=False)
save_json('../../outputs/mamografia/cbis-ddsm/metadata_mass_case_cbis-ddsm.json', mass_case_json)
save_json('../../outputs/mamografia/cbis-ddsm/metadata_calc_case_cbis-ddsm.json', calc_case_json)

### CMMD

In [None]:
# Root folder of the CMMD dataset (clinical spreadsheet + `all-dicom` tree).
path = "D:/mathe/Documents/banco_dados_ic/mamografia/cmmd"
columns_rename = {'ID1': 'id', 'LeftRight': 'left_or_right_breast', 'Age': 'patient_age',
                  'abnormality': 'abnormality_type', 'classification': 'pathology'}

left_or_right_breast_dict = {'L': 'left', 'R': 'right'}

# Load the clinical data and normalise column names and values.
df = pd.read_excel(f"{path}/cmmd_clinical_data_revision.xlsx")
df = df.rename(columns=columns_rename)
df['left_or_right_breast'] = df['left_or_right_breast'].map(left_or_right_breast_dict)
df['pathology'] = df['pathology'].str.lower()

case_json = []

# One output row per DICOM file found for each clinical row. Rows are gathered
# in a list and turned into a DataFrame once, instead of concatenating inside
# the loop (quadratic).
image_rows = []
for _, row in df.iterrows():
    directory = Path(f"{path}/all-dicom/{row['id']}")

    # Sorted so the row order does not depend on filesystem listing order.
    for dicom_path in sorted(directory.rglob("*.dcm*")):
        # Same '/'-separated string in both the CSV and the JSON, so the two
        # outputs agree on the path of every image.
        image_path = str(dicom_path).replace('\\', '/')

        new_line = row.to_dict()
        new_line['image_path'] = image_path
        image_rows.append(new_line)

        # Read the DICOM and extract its metadata.
        dicom_file = dcmread(dicom_path)
        case_json.append({'id': new_line['id'], 'image_path': image_path}
                         | get_dicom_meta(dicom_file))

new_df = pd.DataFrame(image_rows)

# Save the per-image table (CSV) and the DICOM metadata (JSON).
new_df.to_csv('../../outputs/mamografia/cmmd/metadata_cmmd.csv', index=False)
save_json('../../outputs/mamografia/cmmd/metadata_cmmd.json', case_json)

### MIAS

In [5]:
# Root folder of the MIAS dataset (Info.txt, `all-mias` images, `all-roi` outputs).
path = "D:/mathe/Documents/banco_dados_ic/mamografia/mias"
columns_rename_dict = {'REFNUM': 'id', 'BG': 'background_tissue', 'CLASS': 'abnormality_type',
                       'SEVERITY': 'pathology', 'X': 'x_center_abnormality', 'Y':
                       'y_center_abnormality', 'RADIUS': 'radius'}
pathology_dict = {'B': "benign", "M": "malignant"}
abnormality_dict = {'CALC': 'Calcification', 'CIRC': 'Well-defined/circumscribed masses',
                    'SPIC': 'Spiculated masses', 'MISC': 'Other, ill-defined masses',
                    'ARCH': 'Architectural distortion', 'ASYM': 'Asymmetry', 'NORM': 'Normal'
}
background_tissue_dict = {'F': 'Fatty', 'G': 'Fatty-glandular', 'D': 'Dense-glandular'}


def _crop_with_zero_padding(image, x, y, radius):
    """Return the square of side 4*radius centred on (x, y), zero-padded where it leaves the image."""
    height, width = image.shape[:2]
    x_start, y_start = x - 2 * radius, y - 2 * radius
    x_end, y_end = x + 2 * radius, y + 2 * radius

    # Part of the window that actually lies inside the image.
    window = image[max(y_start, 0):min(y_end, height), max(x_start, 0):min(x_end, width)]

    # Number of rows/columns that fell outside on each side; np.pad keeps the
    # image dtype (stacking float np.zeros blocks would promote it to float64).
    padding = (
        (max(-y_start, 0), max(y_end - height, 0)),
        (max(-x_start, 0), max(x_end - width, 0)),
    )
    return np.pad(window, padding, mode='constant', constant_values=0)


# Load the annotation table and rename columns, abnormality, pathology and
# background-tissue codes to readable values.
df_mias = (
    pd.read_csv(f'{path}/Info.txt', sep=' ')
    .drop('Unnamed: 7', axis=1)
    .rename(columns=columns_rename_dict)
)
df_mias['abnormality_type'] = df_mias['abnormality_type'].map(abnormality_dict)
df_mias['pathology'] = df_mias['pathology'].map(pathology_dict)
df_mias['background_tissue'] = df_mias['background_tissue'].map(background_tissue_dict)

# Build the image_path column.
df_mias['image_path'] = path + '/all-mias/' + df_mias['id'] + '.pgm'

# NOTE(review): output files are named after the image id only. If Info.txt
# lists the same id more than once, later rows overwrite the ROI/crop written
# for earlier rows - confirm whether that can happen.
cropped_path = []
roi_path = []
for image_path, image_id, x, y, radius in zip(df_mias['image_path'], df_mias['id'],
                                              df_mias['x_center_abnormality'],
                                              df_mias['y_center_abnormality'],
                                              df_mias['radius']):

    # No ROI/crop without a complete annotation. Any missing value skips the
    # row (int(NaN) would raise otherwise). np.nan instead of np.NaN, which
    # was removed in NumPy 2.0.
    if pd.isna(x) or pd.isna(y) or pd.isna(radius):
        cropped_path.append(np.nan)
        roi_path.append(np.nan)
        continue

    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread returns None instead of raising when it cannot read a file.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    radius = int(radius)
    # The y coordinate is flipped against the image height (the original code
    # used the constant 1024). NOTE(review): presumably Info.txt measures y
    # from the bottom edge - confirm.
    x, y = int(x), image.shape[0] - int(y)

    # Filled circular mask at the annotated centre and radius.
    mask = np.zeros_like(image)
    cv2.circle(mask, (x, y), radius, 255, -1)

    # Apply the mask, then binarise: non-zero pixels inside the circle become 255.
    roi_image = cv2.bitwise_and(image, mask)
    roi_image[roi_image != 0] = 255

    # Save the ROI and record its path.
    roi_file = f"{path}/all-roi/{image_id}_roi.png"
    if not cv2.imwrite(roi_file, roi_image):
        raise OSError(f"Could not write ROI image: {roi_file}")
    roi_path.append(roi_file)

    # Save the crop around the abnormality and record its path.
    cropped_image = _crop_with_zero_padding(image, x, y, radius)
    cropped_file = f"{path}/all-roi/{image_id}_cropped.png"
    if not cv2.imwrite(cropped_file, cropped_image):
        raise OSError(f"Could not write cropped image: {cropped_file}")
    cropped_path.append(cropped_file)

# Attach the generated paths and export.
df_mias['cropped_image_path'] = cropped_path
df_mias['roi_image_path'] = roi_path
# Plain assignment: a chained `df[col].fillna(..., inplace=True)` does not
# update the frame under pandas Copy-on-Write.
df_mias['pathology'] = df_mias['pathology'].fillna('normal')
df_mias.to_csv('../../outputs/mamografia/mias/metadata_mias.csv', index=False)
df_mias.head()

Unnamed: 0,id,background_tissue,abnormality_type,pathology,x_center_abnormality,y_center_abnormality,radius,image_path,cropped_image_path,roi_image_path
0,mdb001,Fatty-glandular,Well-defined/circumscribed masses,benign,535.0,425.0,197.0,D:/mathe/Documents/banco_dados_ic/mamografia/m...,D:/mathe/Documents/banco_dados_ic/mamografia/m...,D:/mathe/Documents/banco_dados_ic/mamografia/m...
1,mdb002,Fatty-glandular,Well-defined/circumscribed masses,benign,522.0,280.0,69.0,D:/mathe/Documents/banco_dados_ic/mamografia/m...,D:/mathe/Documents/banco_dados_ic/mamografia/m...,D:/mathe/Documents/banco_dados_ic/mamografia/m...
2,mdb003,Dense-glandular,Normal,normal,,,,D:/mathe/Documents/banco_dados_ic/mamografia/m...,,
3,mdb004,Dense-glandular,Normal,normal,,,,D:/mathe/Documents/banco_dados_ic/mamografia/m...,,
4,mdb005,Fatty,Well-defined/circumscribed masses,benign,477.0,133.0,30.0,D:/mathe/Documents/banco_dados_ic/mamografia/m...,D:/mathe/Documents/banco_dados_ic/mamografia/m...,D:/mathe/Documents/banco_dados_ic/mamografia/m...


### INBREAST

In [None]:
# Root folder of the INbreast dataset (spreadsheet, `all-dicom`, `all-xml`).
path = "D:/mathe/Documents/banco_dados_ic/mamografia/inbreast"
columns_rename = { 'laterality': 'left_or_right_breast', 'view': 'image_view', 'file_name': 'id', 'mass_': 'is-mass',
                  'micros': 'is-micros', 'distortion': 'is-distortion', 'asymmetry': 'is-asymmetry',
                  'findings_notes_(in_portuguese)': 'findings_notes'}

left_or_right_breast_dict = {'L': 'left', 'R': 'right'}

# Load the spreadsheet and drop the columns that are not exported.
df = pd.read_excel(f"{path}/INbreast.xls")
df = df.drop(columns=["Patient ID", "Patient age", "Lesion Annotation Status", "Other Notes"])

# Normalise headers (underscores, lower case) and apply the renames.
df.columns = df.columns.str.replace(" ", "_").str.lower()
df = df.rename(columns=columns_rename)

# Expand the laterality codes.
df["left_or_right_breast"] = df["left_or_right_breast"].map(left_or_right_breast_dict)

# Move `id` to the first column.
columns_order = df.columns.to_list()
columns_order.remove("id")
columns_order.insert(0, "id")
df = df[columns_order]

# Finding flags: missing -> 'No', 'X' -> 'Yes'.
columns_nan = ["is-mass", "is-micros", "is-distortion", "is-asymmetry"]
df[columns_nan] = df[columns_nan].replace({np.nan: 'No', 'X': 'Yes'})

# Annotation columns: missing -> 'No annotation'.
columns_nan = ['pectoral_muscle_annotation', 'other_annotations']
df[columns_nan] = df[columns_nan].replace({np.nan: 'No annotation'})

directory = Path(f"{path}/all-dicom")
dicom_image_paths = list(directory.rglob("*.dcm"))

directory = Path(f"{path}/all-xml")
roi_image_paths = list(directory.rglob("*xml"))

image_paths = []
roi_paths = []
case_json = []

for _, row in df.iterrows():
    image_id = str(row["id"])

    # DICOM whose path contains the id; fail with a clear message rather than
    # an IndexError when there is none. The comprehension variable is not
    # called `path`, so it does not shadow the dataset root.
    dicom_matches = [candidate for candidate in dicom_image_paths if image_id in str(candidate)]
    if not dicom_matches:
        raise FileNotFoundError(f"No DICOM file found for id {image_id} under {path}/all-dicom")
    image_path = str(dicom_matches[0]).replace('\\', '/')

    # The XML annotation is optional: NaN when no file matches the id.
    xml_matches = [candidate for candidate in roi_image_paths if image_id in str(candidate)]
    roi_image_path = str(xml_matches[0]).replace('\\', '/') if xml_matches else np.nan

    image_paths.append(image_path)
    roi_paths.append(roi_image_path)

    # Read the DICOM and extract its metadata.
    dicom_file = dcmread(image_path)
    case_json.append({'id': row['id'], 'image_path': image_path}
                     | get_dicom_meta(dicom_file))

df['image_path'] = image_paths
df['roi_image_path'] = roi_paths

df.to_csv('../../outputs/mamografia/inbreast/metadata_inbreast.csv', index=False)
save_json('../../outputs/mamografia/inbreast/metadata_inbreast.json', case_json)

df.head()