In [2]:
import pandas as pd
import os
from google.colab import drive
from sklearn.model_selection import train_test_split

In [4]:
# Mount Google Drive inside the Colab runtime; the paths configured further
# down in the notebook all live under this mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
# Column names used in the metadata CSV
ID_COLUMN = 'isic_id'              # column holding the image ID / file name
DIAGNOSIS_COLUMN = 'diagnosis_1'   # column holding the diagnosis

# Path configuration: every location is derived from the mounted Drive root
drive_root = '/content/drive/MyDrive'
dataset_dir = f"{drive_root}/DatasetTIC"

csv_path = f"{dataset_dir}/bcn20000_metadata_2025-07-22.csv"
images_folder = f"{dataset_dir}/ISIC-images"
output_folder = f"{drive_root}/Modelo_CNN"

# Create the output folder if it is not there yet (no error when it exists)
os.makedirs(output_folder, exist_ok=True)

In [None]:
# Load the metadata CSV into the raw frame used by the rest of the notebook.
df = pd.read_csv(csv_path)

# Fail early, with a clear message, if the configured columns are not in the
# CSV instead of hitting a bare KeyError in a later cell.
missing_columns = [col for col in (ID_COLUMN, DIAGNOSIS_COLUMN) if col not in df.columns]
if missing_columns:
    raise KeyError(f"Columns missing from {csv_path}: {missing_columns}")

print(f"Total de imágenes: {len(df)}")
# Use the configured diagnosis column rather than a hard-coded name so this
# summary stays in sync with the config cell.
# Note: value_counts() leaves out missing values by default, so the per-class
# counts can add up to less than the total number of rows.
print(f"Distribución de clases:\n{df[DIAGNOSIS_COLUMN].value_counts()}")

Total de imágenes: 18946
Distribución de clases:
diagnosis_1
Malignant        8871
Benign           7831
Indeterminate    1088
Name: count, dtype: int64


In [None]:
# CLEANING AND BINARY LABELLING

# Binary encoding of the diagnosis: Benign=0, Malignant=1
label_mapping = {'Benign': 0, 'Malignant': 1}

# Keep only the rows diagnosed as Benign or Malignant
is_binary_class = df[DIAGNOSIS_COLUMN].isin(['Benign', 'Malignant'])
df_clean = df.loc[is_binary_class].copy()
df_clean['label'] = df_clean[DIAGNOSIS_COLUMN].map(label_mapping)


def image_path_for(image_id):
    """Return the .jpg path under images_folder for the given image ID."""
    return f"{images_folder}/{image_id}.jpg"


# Reduce the frame to the two columns used from here on: filepath and label
df_simple = pd.DataFrame({
    'filepath': df_clean[ID_COLUMN].map(image_path_for),
    'label': df_clean['label'],
})

# Absolute counts and percentages per label
class_counts_clean = df_simple['label'].value_counts()
class_percentages = df_simple['label'].value_counts(normalize=True) * 100

print(f"\nTotal de registros: {len(df_simple):,}")
print("\nDistribución de clases:")
print(class_counts_clean)
print("\nProporción:")
print(class_percentages)




Total de registros: 16,702

Distribución de clases:
label
1    8871
0    7831
Name: count, dtype: int64

Proporción:
label
1    53.1134
0    46.8866
Name: proportion, dtype: float64


In [None]:
# SPLIT

# Arguments shared by both stratified splits
shared_split_args = {'random_state': 42, 'shuffle': True}

# Stage 1: 70% train, 30% held out
train_df, temp_df = train_test_split(
    df_simple,
    test_size=0.30,
    stratify=df_simple['label'],
    **shared_split_args,
)

# Stage 2: halve the held-out part (15% val and 15% test of the total)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df['label'],
    **shared_split_args,
)

# Report the size of each split and its share of the filtered dataset
n_total = len(df_simple)
for heading, part in (("\nTrain", train_df), ("Val", val_df), ("Test", test_df)):
    n_rows = len(part)
    print(f"{heading}: {n_rows} ({n_rows/n_total*100:.1f}%)")


Train: 11691 (70.0%)
Val: 2505 (15.0%)
Test: 2506 (15.0%)


In [None]:
# Check the stratification: print the label proportions of every split
print("\n--- Distribución por split ---")
splits_by_name = {'Train': train_df, 'Val': val_df, 'Test': test_df}
for split_name, split_frame in splits_by_name.items():
    label_proportions = split_frame['label'].value_counts(normalize=True)
    print(f"{split_name}:\n{label_proportions}\n")


--- Distribución por split ---
Train:
label
1    0.531092
0    0.468908
Name: proportion, dtype: float64

Val:
label
1    0.531337
0    0.468663
Name: proportion, dtype: float64

Test:
label
1    0.531125
0    0.468875
Name: proportion, dtype: float64



In [None]:
# Save the splits as CSV files

# Sub-folder of the output folder that holds the split CSVs
csv_folder = os.path.join(output_folder, "csv_splits")
os.makedirs(csv_folder, exist_ok=True)

# One CSV per split, written without the index column
frames_by_filename = {
    "train.csv": train_df,
    "val.csv": val_df,
    "test.csv": test_df,
}
for filename, frame in frames_by_filename.items():
    frame.to_csv(os.path.join(csv_folder, filename), index=False)

In [None]:
import json


def _label_counts(frame):
    """Return {label: count} for frame['label'] with str keys and int values (JSON-safe)."""
    return {str(label): int(count) for label, count in frame['label'].value_counts().items()}


# The three splits are expected to partition the filtered frame exactly; stop
# here rather than write metadata describing an inconsistent state (e.g. after
# cells were re-run out of order).
assert len(train_df) + len(val_df) + len(test_df) == len(df_simple), \
    "train/val/test sizes do not add up to the filtered dataset"

# Build the split description.
# 'total_images' and 'class_distribution' describe the RAW frame (every row of
# the CSV, including diagnoses that were filtered out), while the split sizes
# refer to the filtered frame. 'used_images' and 'split_label_distribution'
# are recorded as well so the file can be checked against itself.
config = {
    'total_images': len(df),
    'train_size': len(train_df),
    'val_size': len(val_df),
    'test_size': len(test_df),
    'random_seed': 42,  # must match the random_state passed to train_test_split
    'stratify_column': DIAGNOSIS_COLUMN,
    'images_folder': images_folder,
    'class_distribution': df[DIAGNOSIS_COLUMN].value_counts().to_dict(),
    'used_images': len(df_simple),
    'split_label_distribution': {
        'train': _label_counts(train_df),
        'val': _label_counts(val_df),
        'test': _label_counts(test_df),
    },
}

# Save the label mapping
mapping_path = os.path.join(output_folder, "label_mapping.json")
with open(mapping_path, "w", encoding="utf-8") as f:
    json.dump(label_mapping, f, indent=4)

# Save the split information
split_info_path = os.path.join(output_folder, "split_info.json")
with open(split_info_path, "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)