In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import tensorflow as tf
# Modelo preentrenado
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model


In [None]:
# Load the train and test tables from their CSV files.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Root folder under which the training images are looked up.
base_path = 'train_images'

# Build every image path as <base_path>/<patient_id>/<image_id>.png and
# store it in a new 'img_path' column of the train dataframe.
patient_dirs = train_df['patient_id'].astype(str)
image_files = train_df['image_id'].astype(str) + '.png'
train_df['img_path'] = base_path + '/' + patient_dirs + '/' + image_files

display(train_df.head(3))

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,img_path
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False,train_images/10006/462822612.png
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False,train_images/10006/1459541791.png
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False,train_images/10006/1864590858.png


In [None]:
# Rows of the train dataframe whose BIRADS value is missing.
birads_is_missing = train_df['BIRADS'].isna()
patients_with_nan_BIRADS = train_df.loc[birads_is_missing]

# Among those, keep only the rows labelled cancer == 1.
patients_with_cancer_nan_BIRADS = patients_with_nan_BIRADS.loc[
    patients_with_nan_BIRADS['cancer'].eq(1)
]

# Number of rows (one per dataframe record) with cancer and a missing BIRADS value.
print(patients_with_cancer_nan_BIRADS.shape[0])

494


In [None]:
# Non-missing BIRADS values, kept for inspecting their distribution
# (assumes the column is named 'BIRADS').
birads = train_df.loc[train_df['BIRADS'].notna(), 'BIRADS']

In [None]:
# Derive the sampling probabilities from the data itself.
# Only the rows where BIRADS is present take part in the estimate.
available_birads = train_df['BIRADS'].dropna()

# Relative frequency of each observed value, ordered by the value itself.
birads_frequencies = available_birads.value_counts(normalize=True)
birads_probs = birads_frequencies.sort_index().to_list()

# Helper that simulates missing categorical values.
def simulate_categorical_birads(size, probs, categories=None):
    """Draw `size` random category labels according to `probs`.

    Sampling uses NumPy's global random state (`np.random.choice`), so a prior
    `np.random.seed(...)` call makes the result reproducible.

    Parameters
    ----------
    size : int
        Number of values to draw. `0` yields an empty array.
    probs : sequence of float
        Probability of each category, in the same order as `categories`.
    categories : sequence, optional
        The labels to draw from. Defaults to `[0, 1, 2]`, the values that were
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Array of length `size` containing the sampled labels.

    Raises
    ------
    ValueError
        If `categories` and `probs` have different lengths (for example when a
        category is absent from the data the probabilities were computed on),
        or if NumPy rejects the probabilities.
    """
    if categories is None:
        categories = [0, 1, 2]
    categories = list(categories)
    probs = list(probs)

    # Fail early with an actionable message instead of NumPy's generic one.
    if len(categories) != len(probs):
        raise ValueError(
            f"Got {len(probs)} probabilities for {len(categories)} categories "
            f"{categories}; both must have the same length."
        )

    simulated_values = np.random.choice(categories, size=size, p=probs)
    return simulated_values

# Fix NumPy's global seed so the simulated values are reproducible.
np.random.seed(42)

# Index labels of the rows whose BIRADS value is missing.
birads_missing_mask = train_df['BIRADS'].isnull().to_numpy()
missing_indices = train_df.index[birads_missing_mask]
num_missing = len(missing_indices)

# Draw one simulated value per missing row, using the data-derived probabilities.
simulated_birads = simulate_categorical_birads(num_missing, birads_probs)

# Write the simulated values into the missing positions (mutates train_df).
train_df.loc[missing_indices, 'BIRADS'] = simulated_birads

In [None]:
# Derive the 'density' category probabilities from the data itself.
# Only the rows where density is present take part in the estimate.
available_density = train_df['density'].dropna()

# Relative frequency of each observed value, ordered by the value itself.
density_frequencies = available_density.value_counts(normalize=True)
density_probs = density_frequencies.sort_index().to_list()
print(f"Probabilidades calculadas dinámicamente: {density_probs}")

Probabilidades calculadas dinámicamente: [0.10536138445877163, 0.4292840176450628, 0.41313199864268746, 0.052222599253478115]


In [None]:
# Number of rows in the smaller of the two 'cancer' classes.
minority_class_size = train_df['cancer'].value_counts().min()

# How many majority-class rows to keep for every minority-class row
# (10 means a 10:1 ratio between cancer == 0 and cancer == 1).
majority_class_multiplier = 10

# Keep every row of the minority class (cancer == 1).
minority_samples = train_df[train_df['cancer'] == 1]

# Rows available for down-sampling (cancer == 0).
majority_pool = train_df[train_df['cancer'] == 0]

# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling is without replacement), so never request more than the pool holds.
majority_class_size = min(
    minority_class_size * majority_class_multiplier,
    len(majority_pool),
)

# Random, reproducible subset of the majority class.
majority_samples = majority_pool.sample(majority_class_size, random_state=42)

# Merge both classes, shuffle the rows and renumber the index.
balanced_df = (
    pd.concat([minority_samples, majority_samples])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Show the resulting class counts.
print(balanced_df['cancer'].value_counts())

cancer
0    11580
1     1158
Name: count, dtype: int64


In [None]:
# Build the networks once, up front, so every image reuses the same instances.
model = VGG16(weights="imagenet")

# Secondary model sharing the VGG16 inputs that returns two things per call:
# the activations of the "block5_conv3" layer and the final predictions.
block5_conv3_output = model.get_layer("block5_conv3").output
grad_model = Model(
    inputs=model.inputs,
    outputs=[block5_conv3_output, model.output],
)


In [None]:
# Function that generates the heat-map
def generate_heatmap(image_path, output_size=(512, 512)):
    """Compute a gradient-weighted activation heat-map for one image.

    The image is run through the module-level `grad_model`; the gradient of the
    top-scoring class with respect to the returned convolutional activations is
    averaged per channel and used to weight those activations.

    Parameters
    ----------
    image_path : str
        Path of the image to read with OpenCV.
    output_size : tuple of int, optional
        `(width, height)` passed to `cv2.resize` for the returned map.
        Defaults to `(512, 512)`.

    Returns
    -------
    numpy.ndarray
        Single-channel `uint8` array of shape `(height, width)` with values in
        0..255. It is all zeros when no activation is positive.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read `image_path`.
    """
    print("Generating heatmap")
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # OpenCV loads BGR, while VGG16's preprocess_input expects RGB input
    # (it performs the RGB->BGR swap itself).
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))  # input size the model is fed with
    img_array = np.expand_dims(img, axis=0).astype("float32")
    img_array = tf.keras.applications.vgg16.preprocess_input(img_array)

    with tf.GradientTape() as tape:
        conv_outputs, predictions = grad_model(img_array)
        class_idx = tf.argmax(predictions[0])
        loss = predictions[:, class_idx]

    # Gradient of the winning class score w.r.t. the conv activations,
    # averaged over the two spatial axes to get one weight per channel.
    grads = tape.gradient(loss, conv_outputs)[0]
    weights = tf.reduce_mean(grads, axis=(0, 1))
    heatmap = tf.reduce_sum(weights * conv_outputs[0], axis=-1).numpy()

    # Keep positive evidence only, then scale to [0, 1]. The maximum is taken
    # after clipping and guarded so an all-non-positive map becomes zeros
    # instead of NaN/inf.
    heatmap = np.maximum(heatmap, 0)
    peak = heatmap.max()
    if peak > 0:
        heatmap = heatmap / peak

    heatmap = cv2.resize(heatmap, output_size)
    heatmap = (heatmap * 255).astype("uint8")
    return heatmap

# Overlay the heat-map on the image
def apply_heatmap(image_path, heatmap, alpha=0.6):
    """Blend a colour-mapped heat-map over the image stored at `image_path`.

    Parameters
    ----------
    image_path : str
        Path of the original image, read with OpenCV.
    heatmap : numpy.ndarray
        Single-channel `uint8` map to colourise with the JET colour map. It is
        resized to the original image's dimensions when they differ.
    alpha : float, optional
        Weight of the heat-map in the blend; the image gets `1 - alpha`.

    Returns
    -------
    numpy.ndarray
        The blended image, same height and width as the original.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read `image_path`.
    """
    print("Superponiendo heatmap")
    original_image = cv2.imread(image_path)
    if original_image is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # cv2.addWeighted requires both inputs to have identical dimensions, so
    # fit the heat-map to the image instead of assuming a fixed size.
    height, width = original_image.shape[:2]
    if heatmap.shape[:2] != (height, width):
        heatmap = cv2.resize(heatmap, (width, height))

    heatmap_color = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
    overlay = cv2.addWeighted(heatmap_color, alpha, original_image, 1 - alpha, 0)
    return overlay

# Function that processes a single image
def process_image(row, output_dir="images_processed_heatmap", input_root="train_images"):
    """Generate, overlay and save the heat-map for one dataframe row.

    The output file mirrors the input's location relative to `input_root`
    underneath `output_dir`.

    Parameters
    ----------
    row : mapping
        Must provide the image path under the key `'img_path'`.
    output_dir : str, optional
        Root folder for the written images. Defaults to
        `"images_processed_heatmap"`.
    input_root : str, optional
        Folder the input paths are made relative to. Defaults to
        `"train_images"`.

    Returns
    -------
    str
        `"Procesada: <output_path>"` on success, or
        `"Error en <input_path>: <message>"` if any step failed. Failures are
        reported through the return value rather than raised, so a batch loop
        keeps going.
    """
    print("Guardando heatmap")
    input_path = row['img_path']
    os.makedirs(output_dir, exist_ok=True)

    try:
        heatmap = generate_heatmap(input_path)
        transformed_image = apply_heatmap(input_path, heatmap)

        # Reproduce the input's sub-folder structure under the output root.
        relative_path = os.path.relpath(input_path, start=input_root)
        output_path = os.path.join(output_dir, relative_path)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # cv2.imwrite reports failure by returning False instead of raising;
        # without this check a failed write would be reported as processed.
        if not cv2.imwrite(output_path, transformed_image):
            raise IOError(f"cv2.imwrite could not write {output_path}")
        return f"Procesada: {output_path}"
    except Exception as e:
        return f"Error en {input_path}: {str(e)}"


In [None]:
# Run the heat-map pipeline on every row of the sampled dataframe and echo
# the status line that each call returns.
for _row_label, image_row in balanced_df.iterrows():
    status_message = process_image(image_row)
    print(status_message)

Guardando heatmap
Generating heatmap




Superponiendo heatmap
Procesada: images_processed_heatmap\65030\1344606626.png
Guardando heatmap
Generating heatmap
Superponiendo heatmap
Procesada: images_processed_heatmap\31859\78460969.png
Guardando heatmap
Generating heatmap
Superponiendo heatmap
Procesada: images_processed_heatmap\7098\786606520.png
Guardando heatmap
Generating heatmap
Superponiendo heatmap
Procesada: images_processed_heatmap\31277\401242910.png
Guardando heatmap
Generating heatmap
Superponiendo heatmap
Procesada: images_processed_heatmap\22562\1491977729.png
Guardando heatmap
Generating heatmap
Superponiendo heatmap
Procesada: images_processed_heatmap\8570\2139613775.png
Guardando heatmap
Generating heatmap
Superponiendo heatmap
Procesada: images_processed_heatmap\18421\179426017.png
Guardando heatmap
Generating heatmap
Superponiendo heatmap
Procesada: images_processed_heatmap\17570\130190871.png
Guardando heatmap
Generating heatmap
Superponiendo heatmap
Procesada: images_processed_heatmap\9750\1208494128.png
Gu

In [None]:
# Root folder that holds the heat-map overlays written earlier.
base_path = 'images_processed_heatmap'

# Replace 'img_path' in the balanced dataframe with a path of the form
# <patient_id>/<image_id>.png (base_path is not prepended here).
patient_dirs = balanced_df['patient_id'].astype(str)
image_files = balanced_df['image_id'].astype(str) + '.png'
balanced_df['img_path'] = patient_dirs + '/' + image_files

display(balanced_df.head(3))

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,img_path,label
0,1,65030,1344606626,L,MLO,56.0,0,0,0,1.0,0,B,49,False,65030/1344606626.png,0
1,2,31859,78460969,R,CC,68.0,0,0,0,1.0,0,,48,False,31859/78460969.png,0
2,1,7098,786606520,R,CC,59.0,0,0,0,1.0,0,B,210,False,7098/786606520.png,0
