In [1]:
!pip install pydicom pillow
!apt-get install -y libgdcm-tools
!pip install pylibjpeg pylibjpeg-libjpeg pylibjpeg-openjpeg
!pip install dicomsdl

Collecting pydicom
  Using cached pydicom-3.0.1-py3-none-any.whl.metadata (9.4 kB)
Collecting pillow
  Using cached pillow-11.0.0-cp311-cp311-win_amd64.whl.metadata (9.3 kB)
Using cached pydicom-3.0.1-py3-none-any.whl (2.4 MB)
Using cached pillow-11.0.0-cp311-cp311-win_amd64.whl (2.6 MB)
Installing collected packages: pydicom, pillow
Successfully installed pillow-11.0.0 pydicom-3.0.1


"apt-get" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.


Collecting pylibjpeg
  Downloading pylibjpeg-2.0.1-py3-none-any.whl.metadata (7.8 kB)
Collecting pylibjpeg-libjpeg
  Downloading pylibjpeg_libjpeg-2.3.0-cp311-cp311-win_amd64.whl.metadata (4.8 kB)
Collecting pylibjpeg-openjpeg
  Downloading pylibjpeg_openjpeg-2.4.0-cp311-cp311-win_amd64.whl.metadata (5.7 kB)
Collecting numpy (from pylibjpeg)
  Using cached numpy-2.1.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
Downloading pylibjpeg-2.0.1-py3-none-any.whl (24 kB)
Downloading pylibjpeg_libjpeg-2.3.0-cp311-cp311-win_amd64.whl (744 kB)
   ---------------------------------------- 0.0/744.1 kB ? eta -:--:--
   -------------- ------------------------- 262.1/744.1 kB ? eta -:--:--
   ---------------------------------------- 744.1/744.1 kB 2.6 MB/s eta 0:00:00
Downloading pylibjpeg_openjpeg-2.4.0-cp311-cp311-win_amd64.whl (238 kB)
Using cached numpy-2.1.3-cp311-cp311-win_amd64.whl (12.9 MB)
Installing collected packages: numpy, pylibjpeg-openjpeg, pylibjpeg-libjpeg, pylibjpeg
Successfully insta

In [None]:
import numpy as np
import os
from tqdm import tqdm  # Importar la versión de tqdm para notebooks
from pathlib import Path
import dicomsdl
import multiprocessing as mp
from PIL import Image
import pandas as pd


In [None]:

import shutil  # only needed by this cell, to reset the output directory

# Size passed to PIL's Image.resize, i.e. (width, height) in pixels.
RESIZE_TO = (512, 512)

# Recreate the working directory from scratch. Done in pure Python so the cell
# behaves the same on Linux and Windows (the shell `rm -rf` is POSIX-only).
OUTPUT_ROOT = Path(f'train_images_processed_cv2_dicomsdl_{RESIZE_TO[0]}')
shutil.rmtree(OUTPUT_ROOT, ignore_errors=True)  # a missing directory is fine
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

def dicom_file_to_ary(path):
    """Load one DICOM file and return it as a resized 8-bit grayscale array.

    The pixel data is read with dicomsdl, min-max scaled to the 0..255 range,
    converted to ``uint8`` and resized to ``RESIZE_TO`` with PIL's Lanczos
    filter.

    Args:
        path: Location of the DICOM file. It is converted with ``str()``
            before being handed to ``dicomsdl.open``.

    Returns:
        A C-contiguous ``np.uint8`` array with the resized image.

    Raises:
        ValueError: If the file yields no pixel data, or the pixel array is
            not two-dimensional.
        Exception: Any error raised by PIL while resizing is printed and then
            re-raised unchanged.
    """
    dcm_file = dicomsdl.open(str(path))

    # A single conversion gives float32 values in C-contiguous layout, so no
    # separate type or contiguity check is needed afterwards.
    data = np.array(dcm_file.pixelData(), dtype=np.float32, order='C')

    if data.size == 0:
        raise ValueError(f"La imagen en {path} no contiene datos válidos para procesar o no es un array de NumPy.")

    # PIL can only build a grayscale image from a 2-D array.
    if data.ndim != 2:
        raise ValueError(f"La imagen en {path} no es 2D y no se puede redimensionar con PIL. Forma actual: {data.shape}")

    # NOTE(review): the photometric interpretation is not inspected here, so
    # files stored with inverted grayscale are exported as-is -- confirm this
    # is acceptable for the dataset.

    # Min-max normalisation to [0, 1]. A constant image has a zero value range;
    # dividing by it would produce NaN, so such images are mapped to all zeros.
    lowest = data.min()
    value_range = data.max() - lowest
    if value_range > 0:
        data = (data - lowest) / value_range
    else:
        data = np.zeros_like(data)
    data = (data * 255).astype(np.uint8)

    # Resize with PIL.
    try:
        image = Image.fromarray(data)
        image = image.resize(RESIZE_TO, Image.LANCZOS)
        data_resized = np.ascontiguousarray(np.array(image, dtype=np.uint8))
    except Exception as e:
        print(f"Error al redimensionar la imagen en {path} con PIL: {e}")
        raise

    return data_resized

def process_directory(directory_path):
    """Convert every file of one directory to PNG.

    Each entry of ``directory_path`` is passed to ``dicom_file_to_ary`` and the
    result is saved as
    ``train_images_processed_cv2_dicomsdl_<RESIZE_TO[0]>/<directory name>/<file stem>.png``.

    Args:
        directory_path: Directory holding the DICOM files, as a
            ``pathlib.Path`` or a string.

    Raises:
        Any exception raised while reading, converting or saving a file
        propagates to the caller; the remaining files are not processed.
    """
    directory_path = Path(directory_path)

    # Path.name works with any path separator. Splitting str(path) on '/'
    # returned the whole path on Windows, where the separator is '\'.
    parent_directory = directory_path.name
    output_dir = f'train_images_processed_cv2_dicomsdl_{RESIZE_TO[0]}/{parent_directory}'
    os.makedirs(output_dir, exist_ok=True)

    image_paths = list(directory_path.iterdir())

    # leave=False removes the per-directory bar once it completes.
    for image_path in tqdm(image_paths, desc=f'Procesando carpeta {parent_directory}', leave=False):
        processed_ary = dicom_file_to_ary(image_path)

        # Save the image with PIL.
        Image.fromarray(processed_ary).save(f'{output_dir}/{image_path.stem}.png')



In [None]:
# Root folder of the training DICOM files; presumably one sub-folder per
# patient -- TODO confirm against the dataset layout.
TRAIN_IMAGES_DIR = Path('/kaggle/input/rsna-breast-cancer-detection/train_images')

# Keep sub-directories only: process_directory() calls iterdir() on each entry,
# which fails on a plain file. Sorting makes the processing order reproducible.
directories = sorted(entry for entry in TRAIN_IMAGES_DIR.iterdir() if entry.is_dir())

In [None]:
# Process the directories in parallel, one worker per CPU core, and use tqdm to
# track how many directories have finished.
# NOTE(review): on platforms that start workers with "spawn" (e.g. Windows),
# functions defined in notebook cells may not be importable by the worker
# processes -- confirm this cell is run on a fork-based platform.
worker_count = mp.cpu_count()
with mp.Pool(worker_count) as pool:
    finished = pool.imap_unordered(process_directory, directories)
    overall_bar = tqdm(finished, total=len(directories), desc="Progreso general de carpetas")
    # Results are discarded; iterating only drives the pool and the bar.
    for _ in overall_bar:
        pass

Completamos el DataFrame de entrenamiento (`train_df`) con la ruta del archivo PNG correspondiente a cada imagen.

In [4]:
# Competition metadata tables.
train_df = pd.read_csv('/kaggle/input/rsna-breast-cancer-detection/train.csv')
test_df = pd.read_csv('/kaggle/input/rsna-breast-cancer-detection/test.csv')

# Folder under which the PNG directory tree is looked up.
base_path='/kaggle/input/imagenes-png/kaggle/working'

# Store each image's PNG path in the train dataframe:
#   <base_path>/train_images_processed_cv2_dicomsdl_512/<patient_id>/<image_id>.png
train_df['img_path'] = (
    f'{base_path}/train_images_processed_cv2_dicomsdl_512/'
    + train_df['patient_id'].astype(str).str.cat(train_df['image_id'].astype(str), sep='/')
    + '.png'
)

# Quick look at the first rows, including the new column.
display(train_df.head(3))

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,img_path
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False,/kaggle/input/imagenes-png/kaggle/working/trai...
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False,/kaggle/input/imagenes-png/kaggle/working/trai...
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False,/kaggle/input/imagenes-png/kaggle/working/trai...
