In [1]:
import os
import numpy as np
import pandas as pd
import pydicom
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import GlobalAveragePooling2D, LSTM, Dense, Input, Masking, TimeDistributed
from tensorflow.keras.models import Model
from PIL import Image
from torchinfo import summary 
import os
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torchvision import transforms
from torchvision.transforms import ToPILImage
from sklearn.preprocessing import LabelEncoder
from scipy.ndimage import gaussian_filter
import seaborn as sns

In [3]:
class ResNetFeatureExtractor(nn.Module):
    def __init__(self):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet18(weights='IMAGENET1K_V1')
        self.features = nn.Sequential(*list(resnet.children())[:-2])

        for param in self.features[-2:].parameters():
            param.requires_grad = True

    def forward(self, x):
        features = self.features(x)
        return features.mean([2, 3])

class AttentionModule(nn.Module):
    def __init__(self, hidden_size):
        super(AttentionModule, self).__init__()
        self.attention = nn.Linear(hidden_size * 2, 1)

    def forward(self, x):
        scores = self.attention(x) 
        weights = torch.softmax(scores, dim=1)
        context = (x * weights).sum(dim=1)
        return context

class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=True,
                            bidirectional=True,
                            dropout=0.45)
        self.attention = AttentionModule(hidden_size)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size * 2, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 1)
        )

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        context = self.attention(lstm_out)
        output = self.classifier(context)
        return output

class SequenceClassificationModel(nn.Module):
    def __init__(self, lstm_hidden_size=512, lstm_num_layers=2):
        super(SequenceClassificationModel, self).__init__()
        self.feature_extractor = ResNetFeatureExtractor()
        self.sequence_classifier = LSTMClassifier(input_size=512, 
                                                  hidden_size=lstm_hidden_size,
                                                  num_layers=lstm_num_layers)

    def forward(self, x):
        batch_size, seq_len, c, h, w = x.size()
        x = x.view(batch_size * seq_len, c, h, w)
        features = self.feature_extractor(x)  
        features = features.view(batch_size, seq_len, -1)
        output = self.sequence_classifier(features)
        return output


In [None]:
model = torch.load("model_f1_0.7682.pth")
model.eval()

  model = torch.load("model_f1_0.7682.pth")
  model = torch.load("model_f1_complete.pth")


SequenceClassificationModel(
  (feature_extractor): ResNetFeatureExtractor(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (4): Sequential(
        (0): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu): ReLU(inplace=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=Fal

In [4]:
file_path = '../../Advanced-MRI-Breast-Lesions-DA-Clinical-Jan112024.csv'
dataDirectory = "../../Data/Advanced-MRI-Breast-Lesions"
separador = "=" * 100

excluded_implants = [
    'AMBL-002', 'AMBL-004', 'AMBL-015', 'AMBL-027',
    'AMBL-049', 'AMBL-566', 'AMBL-585', 'AMBL-592',
    'AMBL-617', 'AMBL-618', 'AMBL-620', 'AMBL-624',
    'AMBL-626']

In [5]:
df_subset = pd.read_csv(file_path, sep=';', skiprows=1)[['Patient ID', 'BIRADS']].dropna()
subfolders_ids = [folder for folder in os.listdir(dataDirectory) if os.path.isdir(os.path.join(dataDirectory, folder))]
df_subset = df_subset[~df_subset['Patient ID'].isin(excluded_implants)]
birads_replacements = {'-1': '0', '4A': '4', '3, 4A': '4'}
df_subset['BIRADS'] = df_subset['BIRADS'].replace(birads_replacements).astype(int)
df_subset = df_subset[df_subset['BIRADS'] != 0]
df_subset = df_subset[df_subset['Patient ID'].astype(str).isin(subfolders_ids)]
df_subset['malignant'] = (df_subset['BIRADS'] > 3).astype(int)
amount_pacients = len(df_subset)
birads_group_frequencies = df_subset['malignant'].value_counts().sort_index()
frequencies_table_grouped = pd.DataFrame({
    'Malignant': birads_group_frequencies.index,
    'Frecuencia': birads_group_frequencies.values
})

In [6]:
print(separador)
print("Tipo de BIRADS antes de la modificación:", df_subset['BIRADS'].dtype)
print("Valores únicos de BIRADS antes de la modificación:", df_subset['BIRADS'].unique())
print(separador)
print("Tipo de BIRADS después de la conversión:", df_subset['BIRADS'].dtype)
print("Valores únicos de BIRADS:", df_subset['BIRADS'].unique())
print(separador)
print(f'Dimensiones de los datos: {df_subset.shape}')
print(separador)
print(frequencies_table_grouped)
print(separador)

Tipo de BIRADS antes de la modificación: int32
Valores únicos de BIRADS antes de la modificación: [4 6 2 3 5 1]
Tipo de BIRADS después de la conversión: int32
Valores únicos de BIRADS: [4 6 2 3 5 1]
Dimensiones de los datos: (185, 3)
   Malignant  Frecuencia
0          0          62
1          1         123


In [7]:
def load_dicom_image(file_path):
    try:
        dicom = pydicom.dcmread(file_path)
        if not hasattr(dicom, "pixel_array"):
            raise ValueError(f"El archivo {file_path} no contiene datos de imagen.")
        image = dicom.pixel_array.astype(np.float32)
        intercept = getattr(dicom, "RescaleIntercept", 0)
        slope = getattr(dicom, "RescaleSlope", 1)
        image = image * slope + intercept
        return image

    except Exception as e:
        print(f"Error al cargar la imagen DICOM ({file_path}): {e}")
        return None

def normalize_for_resnet(image):
    p25, p95 = np.percentile(image, (25, 95))
    image = np.clip(image, p25, p95)

    if image.max() == image.min():
        image = np.zeros_like(image)
    else:
        image = (image - image.min()) / (image.max() - image.min())

    image_tensor = torch.tensor(image, dtype=torch.float32).unsqueeze(0)
    image_tensor = image_tensor.repeat(3, 1, 1)

    return image_tensor

In [8]:

test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

In [59]:
class BreastCancerDataset(Dataset):
    def __init__(self, dataframe, root_dir, transform=None, minority_transform=None, max_images=116, image_size=224):
        self.dataframe = dataframe
        self.root_dir = root_dir
        self.transform = test_transform
        self.minority_transform = test_transform
        self.max_images = max_images
        self.to_pil = ToPILImage()
        self.image_size = image_size

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        patient_id = self.dataframe.iloc[idx]['Patient ID']
        label = self.dataframe.iloc[idx]['malignant']
        patient_dir = os.path.join(self.root_dir, patient_id)
        
        dicom_files = sorted([f for f in os.listdir(patient_dir) if f.endswith('.dcm')])

        images = []
        for file in dicom_files[:self.max_images]:
            image_path = os.path.join(patient_dir, file)

            image = load_dicom_image(image_path)
            if image is not None:
                image = normalize_for_resnet(image)
                image = self.to_pil(image)
                if self.minority_transform and label == 0:
                    image = self.minority_transform(image)
                elif self.transform:
                    image = self.transform(image)

                images.append(image)

        sequence_length = len(images)

        if len(images) == 0:
            placeholder_image = torch.zeros((3, self.image_size, self.image_size))
            images = [placeholder_image] * self.max_images
        while len(images) < self.max_images:
            images.append(torch.zeros_like(images[0]))

        images_tensor = torch.stack(images)
        return images_tensor, sequence_length, label

In [60]:
dataset = BreastCancerDataset(dataframe=df_subset, root_dir=dataDirectory, transform=test_transform)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

# Inicializar las listas para las métricas
y_true = []
y_pred = []



In [61]:
with torch.no_grad():
    total_batches = len(dataloader)  # Número total de lotes
    for batch_idx, (images, seq_len, labels) in enumerate(dataloader):
        print(f"Procesando lote {batch_idx + 1}/{total_batches}...")

        output = model(images)

        # Aplicamos el umbral de 0.5 y convertimos a int
        predicted = (output > 0.5).int()

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

        remaining_batches = total_batches - (batch_idx + 1)
        print(f"Quedan {remaining_batches} lotes.")


Procesando lote 1/185...
Quedan 184 lotes.
Procesando lote 2/185...
Quedan 183 lotes.
Procesando lote 3/185...
Quedan 182 lotes.
Procesando lote 4/185...
Quedan 181 lotes.
Procesando lote 5/185...
Quedan 180 lotes.
Procesando lote 6/185...
Quedan 179 lotes.
Procesando lote 7/185...
Quedan 178 lotes.
Procesando lote 8/185...
Quedan 177 lotes.
Procesando lote 9/185...
Quedan 176 lotes.
Procesando lote 10/185...
Quedan 175 lotes.
Procesando lote 11/185...
Quedan 174 lotes.
Procesando lote 12/185...
Quedan 173 lotes.
Procesando lote 13/185...
Quedan 172 lotes.
Procesando lote 14/185...
Quedan 171 lotes.
Procesando lote 15/185...
Quedan 170 lotes.
Procesando lote 16/185...
Quedan 169 lotes.
Procesando lote 17/185...
Quedan 168 lotes.
Procesando lote 18/185...
Quedan 167 lotes.
Procesando lote 19/185...
Quedan 166 lotes.
Procesando lote 20/185...
Quedan 165 lotes.
Procesando lote 21/185...
Quedan 164 lotes.
Procesando lote 22/185...
Quedan 163 lotes.
Procesando lote 23/185...
Quedan 162 lote

In [65]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report
print("Classification Report:")
print(classification_report(y_true, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.76      0.81        62
           1       0.89      0.94      0.91       123

    accuracy                           0.88       185
   macro avg       0.88      0.85      0.86       185
weighted avg       0.88      0.88      0.88       185



In [12]:
random_row = df_subset.sample(n=1) 

print(random_row)

   Patient ID  BIRADS  malignant
70   AMBL-071       4          1


In [9]:
class BreastCancerPredictionDataset(Dataset):
    def __init__(self, patient_id, root_dir, max_images=116, image_size=224, transform=None):
        self.patient_id = patient_id
        self.root_dir = root_dir
        self.max_images = max_images
        self.image_size = image_size
        self.transform = transform
        self.to_pil = ToPILImage()

    def __len__(self):
        return 1

    def __getitem__(self, idx):
        patient_id = self.patient_id
        patient_dir = os.path.join(self.root_dir, patient_id)
        
        dicom_files = sorted([f for f in os.listdir(patient_dir) if f.endswith('.dcm')])

        images = []
        for file in dicom_files[:self.max_images]:
            image_path = os.path.join(patient_dir, file)
            image = load_dicom_image(image_path) 
            if image is not None:
                image = normalize_for_resnet(image)
                image = self.to_pil(image)
                if self.transform:
                    image = self.transform(image)
                
                images.append(image)

        sequence_length = len(images)

        if len(images) == 0:
            placeholder_image = torch.zeros((3, self.image_size, self.image_size))
            images = [placeholder_image] * self.max_images
        while len(images) < self.max_images:
            images.append(torch.zeros_like(images[0]))

        images_tensor = torch.stack(images)
        return images_tensor, sequence_length


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:

def predict_single_patient(patient_id, model, root_dir, transform=None):

    dataset_single_patient = BreastCancerPredictionDataset(patient_id, root_dir, transform=transform)

    dataloader_single_patient = DataLoader(dataset_single_patient, batch_size=1)

    model.eval() 
    with torch.no_grad(): 
        for images, _ in dataloader_single_patient:
            images = images.to(device) 
            outputs = model(images)
            predicted = (outputs > 0.5).int() 
    return predicted[0].item() 


patient_id = random_row["Patient ID"].item() 
prediction = predict_single_patient(patient_id, model, dataDirectory, transform=test_transform)

print(f'Predicción para el paciente {patient_id}: {prediction}')


Predicción para el paciente AMBL-071: 1


In [14]:
torch.save(model, "model_f1_0.7682.pth")