### ResNet

#### Librerías:

In [None]:
from torchvision import transforms

transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Redimensionar a 224x224
    transforms.ToTensor(),  # Convertir a tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalizar
])


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision.models as models
import torchvision.transforms as transforms
from torchvision.models import ResNet50_Weights
from PIL import Image

In [2]:
import os

# Obtener el directorio actual
current_directory = os.getcwd()
data_directory = current_directory[:-3] + 'data\\'

In [3]:
# Cargar ResNet preentrenado
resnet = models.resnet50(weights=ResNet50_Weights.DEFAULT)
resnet = nn.Sequential(*list(resnet.children())[:-1])  # Quitar la última capa de clasificación
resnet.eval()

# Transformación para las imágenes
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [4]:
# Ejemplo: Extraer características de una imagen
image = Image.open(f'{data_directory}images\\images\\1.png')
image_tensor = transform(image).unsqueeze(0)  # Añadir dimensión de batch
with torch.no_grad():
    image_features = resnet(image_tensor).squeeze()  # Embedding de la imagen
print("Dimensión de las características de la imagen:", image_features.shape)

Dimensión de las características de la imagen: torch.Size([2048])


In [None]:
import os
import pandas as pd
from torch.utils.data import Dataset
from PIL import Image
import torch

import os
from PIL import Image

import os
from PIL import Image

class MultiLabelImageDataset(Dataset):
    def __init__(self, csv_file, img_dir, transform=None):
        """
        Args:
            csv_file (str): Ruta del archivo CSV con los nombres de archivo y etiquetas.
            img_dir (str): Ruta de la carpeta donde están las imágenes.
            transform (callable, optional): Transformaciones para aplicar a las imágenes.
        """
        self.data = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Obtener el ID del archivo desde el CSV
        img_id = self.data.iloc[idx, 0]  # Primera columna del CSV
        
        # Verificar extensiones posibles si no está incluida en el ID
        extensions = ["", ".jpg", ".png", ".tif"]
        img_path = None

        for ext in extensions:
            temp_path = os.path.join(self.img_dir, img_id + ext)
            if os.path.exists(temp_path):
                img_path = temp_path
                break

        if img_path is None:
            raise FileNotFoundError(
                f"No se encontró la imagen para el ID '{img_id}' con las extensiones {extensions} en {self.img_dir}."
            )

        # Cargar la imagen
        image = Image.open(img_path).convert("RGB")  # Convertir a RGB
        if self.transform:
            image = self.transform(image)

        # Etiquetas multi-label
        labels = torch.tensor(self.data.iloc[idx, 1:].values.astype(float))

        return image, labels

from torchvision import transforms

transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Redimensionar a 224x224
    transforms.ToTensor(),  # Convertir a tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalizar
])


In [23]:
from torch.utils.data import DataLoader

# Rutas de los archivos
train_csv = f"{data_directory}train_data.csv"
test_csv = f"{data_directory}test_data.csv"
img_dir = f"{data_directory}images/images"

# Crear datasets
train_dataset = MultiLabelImageDataset(csv_file=train_csv, img_dir=img_dir, transform=transform)
test_dataset = MultiLabelImageDataset(csv_file=test_csv, img_dir=img_dir, transform=transform)

# Crear DataLoaders
batch_size = 32  # Tamaño de los mini-lotes
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
class AdaptiveAttention(nn.Module):
    def __init__(self, d_model, nhead, dropout=0.1):
        super(AdaptiveAttention, self).__init__()
        self.d_model = d_model
        self.nhead = nhead

        # Proyecciones lineales para Q, K, V
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)

        # Parámetro adicional para la máscara adaptativa
        self.adaptive_mask = nn.Parameter(torch.zeros(nhead, d_model // nhead))

        # Softmax para calcular atención
        self.softmax = nn.Softmax(dim=-1)

        # Dropout y proyección final
        self.dropout = nn.Dropout(dropout)
        self.out_proj = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        batch_size, seq_len, _ = x.size()

        # Calcular Q, K, V
        Q = self.query(x).view(batch_size, seq_len, self.nhead, -1).transpose(1, 2)
        K = self.key(x).view(batch_size, seq_len, self.nhead, -1).transpose(1, 2)
        V = self.value(x).view(batch_size, seq_len, self.nhead, -1).transpose(1, 2)

        # Escalar y calcular similitud
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_model ** 0.5)

        # Aplicar máscara adaptativa
        scores = scores + self.adaptive_mask.unsqueeze(0).unsqueeze(2)

        # Aplicar máscara si es necesario
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        # Calcular atención
        attention = self.softmax(scores)
        attention = self.dropout(attention)

        # Combinar valores
        x = torch.matmul(attention, V).transpose(1, 2).contiguous()
        x = x.view(batch_size, seq_len, -1)
        x = self.out_proj(x)

        return x
    
class AdaptiveTransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super(AdaptiveTransformerEncoderLayer, self).__init__()
        self.adaptive_attention = AdaptiveAttention(d_model, nhead, dropout)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(dim_feedforward, d_model),
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask=None):
        # Atención adaptativa
        src2 = self.adaptive_attention(src, mask=src_mask)
        src = src + self.dropout(src2)
        src = self.norm1(src)

        # Feedforward
        src2 = self.feed_forward(src)
        src = src + self.dropout(src2)
        src = self.norm2(src)

        return src


In [None]:
from torch.nn import TransformerEncoder

# Parámetros del Transformer
embedding_dim = 2048  # Dimensión de salida de ResNet
num_heads = 8
num_layers = 6

# Construir TransformerEncoder con capas personalizadas
encoder_layers = [AdaptiveTransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads) for _ in range(num_layers)]
transformer_encoder = TransformerEncoder(nn.ModuleList(encoder_layers), num_layers=num_layers)

# Reemplaza el forward del Transformer
class AdaptiveAttentionClassifier(nn.Module):
    def __init__(self, image_feature_dim, num_classes):
        super(AdaptiveAttentionClassifier, self).__init__()
        self.image_embed = nn.Linear(image_feature_dim, 512)  # Embedding de características
        self.transformer_encoder = nn.ModuleList([
            AdaptiveTransformerEncoderLayer(d_model=512, nhead=8) for _ in range(6)
        ])
        self.mlp = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes),
        )

    def forward(self, image_features):
        # Proyección de características
        image_embeddings = self.image_embed(image_features)

        # Pasar por las capas del Transformer manualmente
        for layer in self.transformer_encoder:
            image_embeddings = layer(image_embeddings.unsqueeze(1)).squeeze(1)

        # Clasificador final
        output = self.mlp(image_embeddings)
        return output





In [25]:
# Crear embeddings aleatorios para etiquetas y estados
label_embeddings = torch.nn.Embedding(num_embeddings=5, embedding_dim=128)  # Ejemplo: 5 etiquetas (DR, NORMAL, etc.)
state_embeddings = torch.nn.Embedding(num_embeddings=3, embedding_dim=128)  # Ejemplo: 3 estados (YES, NO, ?)

# Etiqueta y estado de entrada
label = torch.tensor([0])  # "DR"
state = torch.tensor([1])  # "NO"

# Obtener embeddings
label_embed = label_embeddings(label)
state_embed = state_embeddings(state)
print("Embedding de etiqueta:", label_embed.shape)
print("Embedding de estado:", state_embed.shape)


Embedding de etiqueta: torch.Size([1, 128])
Embedding de estado: torch.Size([1, 128])


In [18]:
# Concatenar embeddings
combined_embeddings = torch.cat([image_features, label_embed.squeeze(), state_embed.squeeze()], dim=-1)
print("Dimensión de los embeddings combinados:", combined_embeddings.shape)

Dimensión de los embeddings combinados: torch.Size([2304])


In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Modelo (asegúrate de tener tu modelo configurado correctamente)
model = AdaptiveAttentionClassifier(image_feature_dim=2048, num_classes=train_dataset[0][1].size(0))
model.to(device)

# Configuración de optimizador y función de pérdida
criterion = torch.nn.BCEWithLogitsLoss()  # Para clasificación multi-label
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 1  # Número de épocas

for epoch in range(num_epochs):
    # Entrenamiento
    model.train()
    train_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        # Extraer características de ResNet
        with torch.no_grad():
            image_features = resnet(images).flatten(start_dim=1)

        # Forward pass
        outputs = model(image_features)
        loss = criterion(outputs, labels)
        train_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validación
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)

            image_features = resnet(images).flatten(start_dim=1)
            outputs = model(image_features)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    # Promedio de pérdidas
    train_loss /= len(train_loader)
    val_loss /= len(test_loader)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")




AttributeError: 'ModuleList' object has no attribute 'self_attn'

In [None]:
.

In [27]:
csv_file = f"{data_directory}train_data.csv"
img_dir = f"{data_directory}images/images"

data = pd.read_csv(csv_file)
missing_files = []

for img_id in data['ID']:
    file_found = any(
        os.path.exists(os.path.join(img_dir, img_id + ext))
        for ext in ["", ".jpg", ".png", ".tif"]
    )
    if not file_found:
        missing_files.append(img_id)

if missing_files:
    print("Los siguientes archivos no se encontraron en el directorio:")
    print(missing_files)
else:
    print("Todos los archivos del CSV están presentes.")


Todos los archivos del CSV están presentes.


---

In [None]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

# Cargar el modelo y el procesador
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Ejemplo: Imagen y texto de entrada
image = Image.open("example.jpg")  # Reemplaza con tu imagen
text = ["Este es un ejemplo", "Otra descripción"]

# Procesar entradas
inputs = processor(text=text, images=image, return_tensors="pt", padding=True)

# Hacer el cálculo de similitud
outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # Similaridad entre imagen y texto
probs = logits_per_image.softmax(dim=1)  # Convertir a probabilidades

print("Probs:", probs)

##################################################################################
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

# Cargar el modelo AlexNet preentrenado
alexnet = models.alexnet(pretrained=True)
alexnet.eval()

# Preprocesamiento de la imagen
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

image = Image.open("path_to_your_image.jpg")
image_tensor = transform(image).unsqueeze(0)  # Agregar batch dimension

# Extraer características de la imagen (penúltima capa)
with torch.no_grad():
    features = alexnet.features(image_tensor).flatten(start_dim=1)

print("Características de la imagen:", features.shape)


In [None]:
from transformers import CLIPProcessor, CLIPModel

# Cargar modelo CLIP
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Preprocesar imagen
image = Image.open("path_to_image.jpg")
inputs = clip_processor(images=image, return_tensors="pt")
with torch.no_grad():
    image_features = clip_model.get_image_features(**inputs)  # Embedding de la imagen
print("Dimensión de las características de la imagen:", image_features.shape)
