<a href="https://colab.research.google.com/github/nachofranco17/ProyectoCNN/blob/main/ProyectoCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# --------------- Creating the directory where the dataset will be stored + required imports --------------- #

import os, glob, json, textwrap
import pandas as pd
import numpy as np
from pathlib import Path
from google.colab import drive

# Path constants: the project folder sits directly under the Drive "MyDrive" folder.
DRIVE_ROOT = "/content/drive/MyDrive"
PROJ_ROOT = f"{DRIVE_ROOT}/ProyectoCNNv2"

# Mount Google Drive at /content/drive; force_remount=True is passed on every run of this cell.
drive.mount('/content/drive', force_remount=True)

# Create the project folder (and any missing parents); a no-op when it already exists.
Path(PROJ_ROOT).mkdir(parents=True, exist_ok=True)

Mounted at /content/drive


In [4]:
# --- Build a {basename -> absolute path} dictionary over every images_*/images/*.png so later photo lookups are fast ---

# One "images" sub-folder per images_* directory, in sorted order.
img_dirs = sorted(glob.glob(os.path.join(PROJ_ROOT, "images_*", "images")))

# Flatten the PNG listings of all folders into a single list (folder order preserved).
all_img_paths = [
    png
    for folder in img_dirs
    for png in glob.glob(os.path.join(folder, "*.png"))
]

# Key each path by its bare file name; if a name repeats, the last path seen wins.
img_index = {}
for png in all_img_paths:
    img_index[Path(png).name] = png

print(f"Carpetas detectadas: {len(img_dirs)}")
print(f"Imágenes indexadas: {len(img_index):,}")

# Preview the first three entries of the index.
for i, (k, v) in zip(range(3), img_index.items()):
    print(i, "->", k, ":", v)

Carpetas detectadas: 12
Imágenes indexadas: 112,121
0 -> 00001089_000.png : /content/drive/MyDrive/ProyectoCNNv2/images_001/images/00001089_000.png
1 -> 00001090_000.png : /content/drive/MyDrive/ProyectoCNNv2/images_001/images/00001090_000.png
2 -> 00001088_020.png : /content/drive/MyDrive/ProyectoCNNv2/images_001/images/00001088_020.png


In [None]:
# Location of the metadata CSV inside the project folder.
meta_path = os.path.join(PROJ_ROOT, "Data_Entry_2017.csv")

# Read the table and normalize column names in one chain:
# surrounding whitespace is trimmed and inner spaces become underscores.
df = pd.read_csv(meta_path).rename(
    columns=lambda col: col.strip().replace(" ", "_")
)

# ChestX-ray14 classes (canonical order); this list also fixes the key order of every label vector.
CLASSES = [
    "Atelectasis","Cardiomegaly","Consolidation","Edema","Effusion",
    "Emphysema","Fibrosis","Hernia","Infiltration","Mass",
    "Nodule","Pleural_Thickening","Pneumonia","Pneumothorax"
]

# Expand Finding_Labels into binary columns (multi-label)
def labels_to_vec(s):
    """Convert a pipe-separated findings string into a ``{class: 0/1}`` dict.

    Parameters
    ----------
    s : str or missing value
        Labels joined with ``"|"``, e.g. ``"Cardiomegaly|Effusion"``.
        Whitespace around each label is ignored.

    Returns
    -------
    dict
        One key per entry of ``CLASSES`` (in that order), set to 1 when the
        label occurs in ``s`` and 0 otherwise. Labels that are not in
        ``CLASSES`` (including ``"No Finding"``) are ignored, so they yield
        an all-zero vector on their own.

    Notes
    -----
    A non-string input (``None``, or the float NaN pandas uses for empty
    cells) is treated as "no labels" and returns the all-zero vector rather
    than raising ``AttributeError`` on ``.split``.
    """
    vec = {c: 0 for c in CLASSES}

    # Missing / non-text label cell: nothing to mark.
    if not isinstance(s, str):
        return vec

    for raw in s.split("|"):
        label = raw.strip()
        # Only known classes are flagged; "No Finding" is never in CLASSES,
        # so it needs no special case.
        if label in vec:
            vec[label] = 1
    return vec

# Build the binary label matrix in a single DataFrame construction.
# The previous `.apply(labels_to_vec).apply(pd.Series)` created one Series per
# row, which is very slow on a table of this size; a list of dicts fed to the
# constructor yields the same columns and values. `columns=CLASSES` pins the
# column order and keeps the class columns present even for an empty table.
label_records = [labels_to_vec(s) for s in df["Finding_Labels"]]
labels_expanded = pd.DataFrame(label_records, index=df.index, columns=CLASSES)

# Keep the identifying columns next to the expanded labels (aligned on df's index).
df_ml = pd.concat([df[["Image_Index","Patient_ID","Finding_Labels"]], labels_expanded], axis=1)

# Add the absolute file path; names absent from img_index come back as missing values.
df_ml["img_path"] = df_ml["Image_Index"].map(img_index.get)

# Report images that are physically missing (if any)
missing_imgs = df_ml["img_path"].isna().sum()
print("Filas totales:", len(df_ml))
print("Imágenes con ruta encontrada:", len(df_ml) - missing_imgs)
print("Imágenes faltantes (no halladas en carpetas images_*):", missing_imgs)

df_ml.head(3)