In [None]:
import pandas as pd
import numpy as np
import cv2
from dataclasses import dataclass
import pathlib
from tqdm import tqdm

In [None]:
@dataclass(frozen=True)
class DatasetConfig:
    """Seed, cut plane and dataset paths shared by the cells of this notebook.

    The string defaults look like placeholders (they are Portuguese prompts
    such as "path to the dataset") and are presumably meant to be replaced
    with real locations before running. Every ``DATA_ROOT_*`` / ``DATA_TEST_GT``
    entry is derived from ``DATA_ROOT_TARGET``.
    """

    # Value passed as random_state wherever the notebook shuffles data.
    SEED_VALUE: int = 41
    # Placeholder text: choose the cut plane (sagittal / coronal).
    CUT_PLANE: str = "escolha do corte: sagital/coronal"

    # Placeholder text: root folder of the source dataset.
    DATA_ROOT_SOURCE_PATH: str = 'caminho do dataset'
    # Metadata CSV inside the source root (note the Windows-style separator).
    MAIN_DATA_CSV_PATH: str = f'{DATA_ROOT_SOURCE_PATH}\\fdg_metadata.csv'
    # Placeholder text: folder where the images are generated; the cut plane
    # is appended directly, with no separator in between.
    DATA_ROOT_TARGET: str = f'caminho de onde será gerados as imagens{CUT_PLANE}'
    DATA_ROOT_TRAIN: str = f'{DATA_ROOT_TARGET}/Train'
    DATA_ROOT_VALID: str = f'{DATA_ROOT_TARGET}/Valid'
    DATA_ROOT_TEST: str = f'{DATA_ROOT_TARGET}/Test'
    DATA_TEST_GT: str = f'{DATA_ROOT_TARGET}/Test.csv'

In [None]:
# train, val and test split

In [None]:
# Load the image description table (data_description.csv) from the target root folder.
data_df = pd.read_csv(DatasetConfig.DATA_ROOT_TARGET + "/data_description.csv")
data_df

In [None]:
data_df.diagnosis.value_counts()

In [None]:
# Copy the current column names and call the first column "imageId";
# the remaining names are kept as they are.
col_names = data_df.columns.to_list()
col_names[0] = "imageId"
col_names

In [None]:
data_df.columns = col_names
data_df.head()

In [None]:
# Earlier idea (kept for reference): base the new size on the mean width.
#data_df.width.mean()

# Current approach: base the new height/width on the maximum value.
data_df["width"].max()

In [None]:
@dataclass(frozen=True)
class ImageResizeConfig:
    """Size, in pixels, of the padded canvas every exported image is fitted into."""
    height: int = 400 # maximum height value (per the original author's note)
    width: int = 661 # maximum width value (per the original author's note)

In [None]:
def pre_process_and_export_record(row):
    """Letterbox one image to the configured size, make it grayscale and save it as PNG.

    The image at ``DatasetConfig.DATA_ROOT_TARGET + row["filePath"]`` is scaled
    (aspect ratio preserved) to fit inside ``ImageResizeConfig.width`` x
    ``ImageResizeConfig.height``, centred on a black canvas, converted to a
    single grayscale channel, min-max normalised to the 0-255 range and written
    to ``<DATA_ROOT_TARGET>/<subset>/<class>/<imageId>.png``, where the class is
    "0" for a "NEGATIVE" diagnosis and "1" otherwise.

    Args:
        row: Record exposing ``filePath``, ``diagnosis``, ``subset`` and
            ``imageId`` (e.g. a pandas Series yielded by ``DataFrame.iterrows``).

    Raises:
        FileNotFoundError: If OpenCV cannot read the source image (missing
            file or unsupported/corrupt content).
        OSError: If OpenCV reports that the output image was not written.
    """
    # The path is a plain concatenation: row["filePath"] must carry its own
    # leading separator. NOTE(review): confirm against data_description.csv.
    image_path = DatasetConfig.DATA_ROOT_TARGET + row["filePath"]
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    target_height = ImageResizeConfig.height
    target_width = ImageResizeConfig.width

    # Largest uniform scale at which the image still fits inside the canvas.
    original_height, original_width = img.shape[:2]
    scale = min(target_width / original_width, target_height / original_height)

    # Truncate as before, but keep each side within [1, target] so that
    # cv2.resize never receives a zero dimension and the paste below always fits.
    new_width = min(target_width, max(1, int(original_width * scale)))
    new_height = min(target_height, max(1, int(original_height * scale)))
    img_resized = cv2.resize(img, (new_width, new_height))

    # Black canvas with the resized image centred on it.
    img_padded = np.zeros((target_height, target_width, 3), dtype=np.uint8)
    x_offset = (target_width - new_width) // 2
    y_offset = (target_height - new_height) // 2
    img_padded[y_offset:y_offset + new_height, x_offset:x_offset + new_width] = img_resized

    img_gray = cv2.cvtColor(img_padded, cv2.COLOR_BGR2GRAY)
    img_normalized = cv2.normalize(img_gray, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)

    img_class = "0" if row.diagnosis == "NEGATIVE" else "1"

    destination_path = f"{DatasetConfig.DATA_ROOT_TARGET}/{row.subset}/{img_class}"
    pathlib.Path(destination_path).mkdir(parents=True, exist_ok=True)
    destination_path = f"{destination_path}/{row.imageId}.png"

    # cv2.imwrite returns False on failure; surface it instead of silently
    # leaving a hole in the exported dataset.
    if not cv2.imwrite(destination_path, img_normalized):
        raise OSError(f"Could not write image: {destination_path}")

In [None]:
data_df["subset"] = None

In [None]:
data_df.head()

In [None]:
unique_patients = data_df["Subject ID"].unique()
unique_patients

In [None]:
# Shuffle the unique patient IDs with a fixed seed so the split below is reproducible.
shuffled_patients = pd.Series(unique_patients).sample(frac=1, random_state=DatasetConfig.SEED_VALUE)

In [None]:
# Share of the unique patients assigned to each split.
train_frac = 0.8
val_frac = 0.1
# NOTE(review): test_frac is not read by the slicing cell that follows; the
# test split simply receives the patients left after train and validation.
test_frac = 0.1

In [None]:
# Cut the shuffled patient list into three consecutive positional slices:
# train first, then validation, and whatever remains becomes the test set.
n_patients = len(shuffled_patients)
train_end = int(train_frac * n_patients)
val_end = int((train_frac + val_frac) * n_patients)

train_patients = shuffled_patients.iloc[:train_end]
val_patients = shuffled_patients.iloc[train_end:val_end]
test_patients = shuffled_patients.iloc[val_end:]

In [None]:
# Pick the rows of each split by patient ID, so all rows of one patient
# land in the same split.
subject_ids = data_df["Subject ID"]
train_df = data_df.loc[subject_ids.isin(train_patients)]
val_df = data_df.loc[subject_ids.isin(val_patients)]
test_df = data_df.loc[subject_ids.isin(test_patients)]

In [None]:
train_df = train_df.sample(frac=1, random_state=DatasetConfig.SEED_VALUE)
val_df = val_df.sample(frac=1, random_state=DatasetConfig.SEED_VALUE)
test_df = test_df.sample(frac=1, random_state=DatasetConfig.SEED_VALUE)

In [None]:
train_df

In [None]:
val_df

In [None]:
test_df

In [None]:
train_df.diagnosis.value_counts()

In [None]:
val_df.diagnosis.value_counts()

In [None]:
test_df.diagnosis.value_counts()

In [None]:
# Label every row with the name of its split. ``assign`` returns a new frame
# with the "subset" column set, so this cell is idempotent, does not rely on
# the column already existing (attribute-style assignment would otherwise only
# create a Python attribute) and does not write into a filtered/sampled frame.
train_df = train_df.assign(subset="Train")
val_df = val_df.assign(subset="Valid")
test_df = test_df.assign(subset="Test")

In [None]:
#VERIFICAR VAZAMENTO DE DADOS

def check_data_leakage(train_df, val_df, test_df):
    """Print whether any "Subject ID" value appears in more than one split.

    One message is printed per overlapping pair (train/validation, train/test,
    validation/test, in that order), each listing the shared IDs. When no pair
    overlaps, a single "no leakage" message is printed instead.

    Args:
        train_df: Training rows; must have a "Subject ID" column.
        val_df: Validation rows; must have a "Subject ID" column.
        test_df: Test rows; must have a "Subject ID" column.

    Returns:
        None. The result is only reported through ``print``.
    """
    # Distinct patient IDs of each split.
    train_ids, val_ids, test_ids = (
        set(frame["Subject ID"].unique()) for frame in (train_df, val_df, test_df)
    )

    # IDs shared by each pair of splits.
    shared_train_val = train_ids & val_ids
    shared_train_test = train_ids & test_ids
    shared_val_test = val_ids & test_ids

    findings = (
        (shared_train_val, f"Vazamento detectado entre treino e validação: {shared_train_val}"),
        (shared_train_test, f"Vazamento detectado entre treino e teste: {shared_train_test}"),
        (shared_val_test, f"Vazamento detectado entre validação e teste: {shared_val_test}"),
    )
    leak_messages = [message for shared, message in findings if shared]

    if leak_messages:
        for message in leak_messages:
            print(message)
    else:
        # No pair of splits has a patient in common.
        print("Nenhum vazamento de dados detectado entre os conjuntos de treino, validação e teste.")

# Chamar a função de verificação antes de executar o pré-processamento
check_data_leakage(train_df, val_df, test_df)

In [None]:
# Stack the three splits back into a single table and shuffle its rows with the fixed seed.
full_data = pd.concat([train_df, val_df, test_df]).sample(frac=1,random_state=DatasetConfig.SEED_VALUE)
full_data

In [None]:
# Pre-process and export every record, one row at a time, with a progress bar.
for index, row in tqdm(full_data.iterrows(), total=len(full_data)):
    pre_process_and_export_record(row)

In [None]:
# Sanity check: inspect the dimensions of one exported image.
# (cv2 is already imported in the notebook's first cell, so it is not re-imported here.)
# NOTE(review): this absolute path is machine-specific; point it at a file under
# DatasetConfig.DATA_ROOT_TARGET when running elsewhere.
sample_image_path = "D:\\dataset_maior\\sagital\\l\\Train\\1\\6.png"

# IMREAD_UNCHANGED keeps the channel layout stored in the file. The default
# flag always decodes to a 3-channel BGR image, which would report 3 channels
# even for a single-channel (grayscale) PNG.
image = cv2.imread(sample_image_path, cv2.IMREAD_UNCHANGED)
if image is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError(f"Could not read image: {sample_image_path}")

# A grayscale image comes back as a 2-D array, so the channel axis may be absent.
altura, largura = image.shape[:2]
canais = 1 if image.ndim == 2 else image.shape[2]

print(f"Largura: {largura} pixels")
print(f"Altura: {altura} pixels")
print(f"Número de canais: {canais}")