In [1]:
import os
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
from torchvision import transforms
from torchvision.models import resnet50, resnet101, resnet152, ResNet50_Weights,ResNet101_Weights, ResNet152_Weights
from sklearn.metrics import multilabel_confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Visualizar una matriz de confusión para una etiqueta específica
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from SupportFunc.Loader import MultiLabelImageDataset
from SupportFunc.AdaptiveAttention import AdaptiveAttention, AdaptiveTransformerEncoderLayer
from SupportFunc.Visualization import plot_confusion_matrix, plot_train_val_curve

class AdaptiveAttentionClassifier(nn.Module):
    def __init__(self, image_feature_dim, num_classes):
        super(AdaptiveAttentionClassifier, self).__init__()
        self.image_embed = nn.Linear(image_feature_dim, 64)
        self.transformer_encoder = nn.ModuleList([
            AdaptiveTransformerEncoderLayer(d_model=64, nhead=8) for _ in range(1)
        ])
        self.mlp = nn.Sequential(
            nn.Linear(64, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes),
        )

    def forward(self, image_features):
        # Proyección de características
        image_embeddings = self.image_embed(image_features)

        # Pasar por las capas del Transformer
        for layer in self.transformer_encoder:
            image_embeddings = layer(image_embeddings.unsqueeze(1)).squeeze(1)

        # Clasificador final
        output = self.mlp(image_embeddings)
        return output

# Obtener el directorio actual
current_directory = os.getcwd()
data_directory = os.path.join(current_directory[:-3], 'data')
images_directory = os.path.join(data_directory, 'images')

num_epochs = 10
batch_size = 32
lr = 0.0001
image_feature_dim = 2048

subset = [os.path.splitext(filename)[0] for filename in os.listdir(images_directory)]
print(len(subset))

# Transformaciones para las imágenes
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Leer y filtrar datos
train_1 = pd.read_csv(f'{data_directory}/train_data.csv')
test_1 = pd.read_csv(f'{data_directory}/test_data.csv')

train_1 = train_1[train_1['ID'].isin(subset[:500])]
test_1 = test_1[test_1['ID'].isin(subset[:500])]

# Guardar los archivos filtrados
train_1.to_csv(f'{data_directory}/train_data_2.csv', encoding = 'utf-8', index=False)
train_1.to_csv(f'{data_directory}/test_data_2.csv', encoding = 'utf-8', index=False)

# Crear datasets y DataLoaders
train_dataset = MultiLabelImageDataset(csv_file=f"{data_directory}/train_data_2.csv", img_dir=f"{images_directory}/", transform=transform)
test_dataset = MultiLabelImageDataset(csv_file=f"{data_directory}/test_data_2.csv", img_dir=f"{images_directory}/", transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Modelo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
resnet = models.resnet50(weights=ResNet50_Weights.DEFAULT)
#resnet = models.resnet101(weights=ResNet101_Weights.DEFAULT)  # Para ResNet101
#resnet = models.resnet152(weights=ResNet152_Weights.DEFAULT)  # Para ResNet152

# Congelar todas las capas inicialmente
for param in resnet.parameters():
    param.requires_grad = False

# Descongelar las capas que quieres ajustar (por ejemplo, 'layer4' o 'fc')
for param in resnet.layer4.parameters():  # Afinar el bloque final
    param.requires_grad = True
    
# Reemplazar la capa final con una capa adaptada a tu número de clases
num_classes = train_dataset[0][1].size(0)  # Obtener el número de clases desde el dataset
resnet.fc = nn.Identity()
for param in resnet.fc.parameters():
    param.requires_grad = True

resnet = resnet.to(device)

model = AdaptiveAttentionClassifier(image_feature_dim=image_feature_dim, num_classes=num_classes)
model = model.to(device)

# Optimizador y función de pérdida
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, resnet.parameters()), lr=lr)

#Comprobar que las dimensiones de salida de resnet sean consistentes con la siguiente capa
images, labels = next(iter(train_loader))
images, labels = images.to(device), labels.to(device)
image_features = resnet(images)
print(image_features.shape)

# Listas para almacenar pérdidas y métricas
train_losses = list()
val_losses = list()
train_accuracies = list()
val_accuracies = list()
train_f1_scores = list()
val_f1_scores = list()

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    all_train_labels = []
    all_train_preds = []

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        image_features = resnet(images).flatten(start_dim=1)
        outputs = model(image_features)

        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # Calcular predicciones para métricas
        probs = torch.sigmoid(outputs)
        preds = (probs > 0.5).int()  # Umbral de 0.5 para predicciones binarias
        all_train_labels.append(labels.cpu().numpy())
        all_train_preds.append(preds.cpu().numpy())

    # Calcular métricas para entrenamiento
    all_train_labels = np.vstack(all_train_labels)
    all_train_preds = np.vstack(all_train_preds)
    train_accuracy = accuracy_score(all_train_labels, all_train_preds)
    train_f1 = f1_score(all_train_labels, all_train_preds, average="macro", zero_division=1)

    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(train_accuracy)
    train_f1_scores.append(train_f1)

    # Validación
    model.eval()
    val_loss = 0.0
    all_val_labels = []
    all_val_preds = []

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            image_features = resnet(images).flatten(start_dim=1)
            outputs = model(image_features)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Calcular predicciones para métricas
            probs = torch.sigmoid(outputs)
            preds = (probs > 0.5).int()  # Umbral de 0.5 para predicciones binarias
            all_val_labels.append(labels.cpu().numpy())
            all_val_preds.append(preds.cpu().numpy())

    # Calcular métricas para validación
    all_val_labels = np.vstack(all_val_labels)
    all_val_preds = np.vstack(all_val_preds)
    val_accuracy = accuracy_score(all_val_labels, all_val_preds)
    val_f1 = f1_score(all_val_labels, all_val_preds, average="macro", zero_division=1)

    val_losses.append(val_loss / len(test_loader))
    val_accuracies.append(val_accuracy)
    val_f1_scores.append(val_f1)

    # Imprimir métricas
    print(
        f"Epoch {epoch+1}/{num_epochs}, "
        f"Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}, "
        f"Train Acc: {train_accuracies[-1]:.4f}, Val Acc: {val_accuracies[-1]:.4f}, "
        f"Train F1: {train_f1_scores[-1]:.4f}, Val F1: {val_f1_scores[-1]:.4f}"
    )
    
    # Coloca el modelo en modo evaluación
model.eval()

# Listas para almacenar etiquetas reales y predicciones
all_labels = list()
all_preds = list()

# Realizar inferencia
with torch.no_grad():  # No calculamos gradientes
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Extraer características con ResNet y pasar por el modelo
        image_features = resnet(images).flatten(start_dim=1)
        outputs = model(image_features)

        # Aplicar sigmoid para obtener probabilidades
        probs = torch.sigmoid(outputs)

        # Convertir probabilidades en predicciones binarias (umbral = 0.5)
        preds = (probs > 0.5).int()

        # Guardar etiquetas reales y predicciones
        all_labels.append(labels.cpu().numpy())
        all_preds.append(preds.cpu().numpy())

# Convertir listas a arreglos
all_labels = np.vstack(all_labels)  # Etiquetas reales
all_preds = np.vstack(all_preds)    # Predicciones
test_accuracy = accuracy_score(all_labels, all_preds)
test_f1 = f1_score(all_labels, all_preds, average="macro", zero_division=1)
# Calcular matriz de confusión para cada etiqueta
confusion_matrices = multilabel_confusion_matrix(all_labels, all_preds)

print(f'test_accuracy: {test_accuracy}, test_f1: {test_f1}')

2451
torch.Size([32, 2048])
Epoch 1/10, Train Loss: 0.6432, Val Loss: 0.5901, Train Acc: 0.0000, Val Acc: 0.0000, Train F1: 0.0715, Val F1: 0.1423
Epoch 2/10, Train Loss: 0.5817, Val Loss: 0.5612, Train Acc: 0.0027, Val Acc: 0.0027, Train F1: 0.1355, Val F1: 0.1645
Epoch 3/10, Train Loss: 0.5625, Val Loss: 0.5486, Train Acc: 0.0134, Val Acc: 0.0429, Train F1: 0.2252, Val F1: 0.2228
Epoch 4/10, Train Loss: 0.5540, Val Loss: 0.5426, Train Acc: 0.0188, Val Acc: 0.0375, Train F1: 0.2202, Val F1: 0.2376
Epoch 5/10, Train Loss: 0.5507, Val Loss: 0.5398, Train Acc: 0.0349, Val Acc: 0.0456, Train F1: 0.2354, Val F1: 0.2421
Epoch 6/10, Train Loss: 0.5482, Val Loss: 0.5383, Train Acc: 0.0349, Val Acc: 0.0483, Train F1: 0.2147, Val F1: 0.2471
Epoch 7/10, Train Loss: 0.5470, Val Loss: 0.5371, Train Acc: 0.0349, Val Acc: 0.0483, Train F1: 0.2175, Val F1: 0.2527
Epoch 8/10, Train Loss: 0.5462, Val Loss: 0.5362, Train Acc: 0.0483, Val Acc: 0.0617, Train F1: 0.2258, Val F1: 0.2598
Epoch 9/10, Train Lo

In [2]:
import os
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
from torchvision import transforms
from torchvision.models import resnet50, resnet101, resnet152, ResNet50_Weights,ResNet101_Weights, ResNet152_Weights
from sklearn.metrics import multilabel_confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Visualizar una matriz de confusión para una etiqueta específica
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from SupportFunc.Loader import MultiLabelImageDataset
from SupportFunc.AdaptiveAttention import AdaptiveAttention, AdaptiveTransformerEncoderLayer
from SupportFunc.Visualization import plot_confusion_matrix, plot_train_val_curve

class AdaptiveAttentionClassifier(nn.Module):
    def __init__(self, image_feature_dim, num_classes):
        super(AdaptiveAttentionClassifier, self).__init__()
        self.image_embed = nn.Linear(image_feature_dim, 128)
        self.transformer_encoder = nn.ModuleList([
            AdaptiveTransformerEncoderLayer(d_model=128, nhead=8) for _ in range(1)
        ])
        self.mlp = nn.Sequential(
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes),
        )

    def forward(self, image_features):
        # Proyección de características
        image_embeddings = self.image_embed(image_features)

        # Pasar por las capas del Transformer
        for layer in self.transformer_encoder:
            image_embeddings = layer(image_embeddings.unsqueeze(1)).squeeze(1)

        # Clasificador final
        output = self.mlp(image_embeddings)
        return output

# Obtener el directorio actual
current_directory = os.getcwd()
data_directory = os.path.join(current_directory[:-3], 'data')
images_directory = os.path.join(data_directory, 'images')

num_epochs = 10
batch_size = 32
lr = 0.0001
image_feature_dim = 2048

subset = [os.path.splitext(filename)[0] for filename in os.listdir(images_directory)]
print(len(subset))

# Transformaciones para las imágenes
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Leer y filtrar datos
train_1 = pd.read_csv(f'{data_directory}/train_data.csv')
test_1 = pd.read_csv(f'{data_directory}/test_data.csv')

train_1 = train_1[train_1['ID'].isin(subset[:500])]
test_1 = test_1[test_1['ID'].isin(subset[:500])]

# Guardar los archivos filtrados
train_1.to_csv(f'{data_directory}/train_data_2.csv', encoding = 'utf-8', index=False)
train_1.to_csv(f'{data_directory}/test_data_2.csv', encoding = 'utf-8', index=False)

# Crear datasets y DataLoaders
train_dataset = MultiLabelImageDataset(csv_file=f"{data_directory}/train_data_2.csv", img_dir=f"{images_directory}/", transform=transform)
test_dataset = MultiLabelImageDataset(csv_file=f"{data_directory}/test_data_2.csv", img_dir=f"{images_directory}/", transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Modelo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
resnet = models.resnet50(weights=ResNet50_Weights.DEFAULT)
#resnet = models.resnet101(weights=ResNet101_Weights.DEFAULT)  # Para ResNet101
#resnet = models.resnet152(weights=ResNet152_Weights.DEFAULT)  # Para ResNet152

# Congelar todas las capas inicialmente
for param in resnet.parameters():
    param.requires_grad = False

# Descongelar las capas que quieres ajustar (por ejemplo, 'layer4' o 'fc')
for param in resnet.layer4.parameters():  # Afinar el bloque final
    param.requires_grad = True
    
# Reemplazar la capa final con una capa adaptada a tu número de clases
num_classes = train_dataset[0][1].size(0)  # Obtener el número de clases desde el dataset
resnet.fc = nn.Identity()
for param in resnet.fc.parameters():
    param.requires_grad = True

resnet = resnet.to(device)

model = AdaptiveAttentionClassifier(image_feature_dim=image_feature_dim, num_classes=num_classes)
model = model.to(device)

# Optimizador y función de pérdida
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, resnet.parameters()), lr=lr)

#Comprobar que las dimensiones de salida de resnet sean consistentes con la siguiente capa
images, labels = next(iter(train_loader))
images, labels = images.to(device), labels.to(device)
image_features = resnet(images)
print(image_features.shape)

# Listas para almacenar pérdidas y métricas
train_losses = list()
val_losses = list()
train_accuracies = list()
val_accuracies = list()
train_f1_scores = list()
val_f1_scores = list()

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    all_train_labels = []
    all_train_preds = []

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        image_features = resnet(images).flatten(start_dim=1)
        outputs = model(image_features)

        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # Calcular predicciones para métricas
        probs = torch.sigmoid(outputs)
        preds = (probs > 0.5).int()  # Umbral de 0.5 para predicciones binarias
        all_train_labels.append(labels.cpu().numpy())
        all_train_preds.append(preds.cpu().numpy())

    # Calcular métricas para entrenamiento
    all_train_labels = np.vstack(all_train_labels)
    all_train_preds = np.vstack(all_train_preds)
    train_accuracy = accuracy_score(all_train_labels, all_train_preds)
    train_f1 = f1_score(all_train_labels, all_train_preds, average="macro", zero_division=1)

    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(train_accuracy)
    train_f1_scores.append(train_f1)

    # Validación
    model.eval()
    val_loss = 0.0
    all_val_labels = []
    all_val_preds = []

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            image_features = resnet(images).flatten(start_dim=1)
            outputs = model(image_features)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Calcular predicciones para métricas
            probs = torch.sigmoid(outputs)
            preds = (probs > 0.5).int()  # Umbral de 0.5 para predicciones binarias
            all_val_labels.append(labels.cpu().numpy())
            all_val_preds.append(preds.cpu().numpy())

    # Calcular métricas para validación
    all_val_labels = np.vstack(all_val_labels)
    all_val_preds = np.vstack(all_val_preds)
    val_accuracy = accuracy_score(all_val_labels, all_val_preds)
    val_f1 = f1_score(all_val_labels, all_val_preds, average="macro", zero_division=1)

    val_losses.append(val_loss / len(test_loader))
    val_accuracies.append(val_accuracy)
    val_f1_scores.append(val_f1)

    # Imprimir métricas
    print(
        f"Epoch {epoch+1}/{num_epochs}, "
        f"Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}, "
        f"Train Acc: {train_accuracies[-1]:.4f}, Val Acc: {val_accuracies[-1]:.4f}, "
        f"Train F1: {train_f1_scores[-1]:.4f}, Val F1: {val_f1_scores[-1]:.4f}"
    )
    
    # Coloca el modelo en modo evaluación
model.eval()

# Listas para almacenar etiquetas reales y predicciones
all_labels = list()
all_preds = list()

# Realizar inferencia
with torch.no_grad():  # No calculamos gradientes
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Extraer características con ResNet y pasar por el modelo
        image_features = resnet(images).flatten(start_dim=1)
        outputs = model(image_features)

        # Aplicar sigmoid para obtener probabilidades
        probs = torch.sigmoid(outputs)

        # Convertir probabilidades en predicciones binarias (umbral = 0.5)
        preds = (probs > 0.5).int()

        # Guardar etiquetas reales y predicciones
        all_labels.append(labels.cpu().numpy())
        all_preds.append(preds.cpu().numpy())

# Convertir listas a arreglos
all_labels = np.vstack(all_labels)  # Etiquetas reales
all_preds = np.vstack(all_preds)    # Predicciones
test_accuracy = accuracy_score(all_labels, all_preds)
test_f1 = f1_score(all_labels, all_preds, average="macro", zero_division=1)
# Calcular matriz de confusión para cada etiqueta
confusion_matrices = multilabel_confusion_matrix(all_labels, all_preds)

print(f'test_accuracy: {test_accuracy}, test_f1: {test_f1}')

2451
torch.Size([32, 2048])
Epoch 1/10, Train Loss: 0.6029, Val Loss: 0.5401, Train Acc: 0.0241, Val Acc: 0.0751, Train F1: 0.0554, Val F1: 0.2089
Epoch 2/10, Train Loss: 0.5248, Val Loss: 0.5038, Train Acc: 0.0804, Val Acc: 0.0804, Train F1: 0.2676, Val F1: 0.2730
Epoch 3/10, Train Loss: 0.5032, Val Loss: 0.4878, Train Acc: 0.0375, Val Acc: 0.0161, Train F1: 0.2519, Val F1: 0.2322
Epoch 4/10, Train Loss: 0.4928, Val Loss: 0.4796, Train Acc: 0.0241, Val Acc: 0.0080, Train F1: 0.2337, Val F1: 0.2392
Epoch 5/10, Train Loss: 0.4875, Val Loss: 0.4758, Train Acc: 0.0188, Val Acc: 0.0107, Train F1: 0.2313, Val F1: 0.2461
Epoch 6/10, Train Loss: 0.4839, Val Loss: 0.4734, Train Acc: 0.0456, Val Acc: 0.0214, Train F1: 0.2746, Val F1: 0.2710
Epoch 7/10, Train Loss: 0.4827, Val Loss: 0.4717, Train Acc: 0.0563, Val Acc: 0.0483, Train F1: 0.2899, Val F1: 0.3024
Epoch 8/10, Train Loss: 0.4814, Val Loss: 0.4702, Train Acc: 0.0885, Val Acc: 0.0858, Train F1: 0.3207, Val F1: 0.3361
Epoch 9/10, Train Lo

In [3]:
import os
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
from torchvision import transforms
from torchvision.models import resnet50, resnet101, resnet152, ResNet50_Weights,ResNet101_Weights, ResNet152_Weights
from sklearn.metrics import multilabel_confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Visualizar una matriz de confusión para una etiqueta específica
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from SupportFunc.Loader import MultiLabelImageDataset
from SupportFunc.AdaptiveAttention import AdaptiveAttention, AdaptiveTransformerEncoderLayer
from SupportFunc.Visualization import plot_confusion_matrix, plot_train_val_curve

class AdaptiveAttentionClassifier(nn.Module):
    def __init__(self, image_feature_dim, num_classes):
        super(AdaptiveAttentionClassifier, self).__init__()
        self.image_embed = nn.Linear(image_feature_dim, 256)
        self.transformer_encoder = nn.ModuleList([
            AdaptiveTransformerEncoderLayer(d_model=256, nhead=8) for _ in range(1)
        ])
        self.mlp = nn.Sequential(
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes),
        )

    def forward(self, image_features):
        # Proyección de características
        image_embeddings = self.image_embed(image_features)

        # Pasar por las capas del Transformer
        for layer in self.transformer_encoder:
            image_embeddings = layer(image_embeddings.unsqueeze(1)).squeeze(1)

        # Clasificador final
        output = self.mlp(image_embeddings)
        return output

# Obtener el directorio actual
current_directory = os.getcwd()
data_directory = os.path.join(current_directory[:-3], 'data')
images_directory = os.path.join(data_directory, 'images')

num_epochs = 10
batch_size = 32
lr = 0.0001
image_feature_dim = 2048

subset = [os.path.splitext(filename)[0] for filename in os.listdir(images_directory)]
print(len(subset))

# Transformaciones para las imágenes
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Leer y filtrar datos
train_1 = pd.read_csv(f'{data_directory}/train_data.csv')
test_1 = pd.read_csv(f'{data_directory}/test_data.csv')

train_1 = train_1[train_1['ID'].isin(subset[:500])]
test_1 = test_1[test_1['ID'].isin(subset[:500])]

# Guardar los archivos filtrados
train_1.to_csv(f'{data_directory}/train_data_2.csv', encoding = 'utf-8', index=False)
train_1.to_csv(f'{data_directory}/test_data_2.csv', encoding = 'utf-8', index=False)

# Crear datasets y DataLoaders
train_dataset = MultiLabelImageDataset(csv_file=f"{data_directory}/train_data_2.csv", img_dir=f"{images_directory}/", transform=transform)
test_dataset = MultiLabelImageDataset(csv_file=f"{data_directory}/test_data_2.csv", img_dir=f"{images_directory}/", transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Modelo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
resnet = models.resnet50(weights=ResNet50_Weights.DEFAULT)
#resnet = models.resnet101(weights=ResNet101_Weights.DEFAULT)  # Para ResNet101
#resnet = models.resnet152(weights=ResNet152_Weights.DEFAULT)  # Para ResNet152

# Congelar todas las capas inicialmente
for param in resnet.parameters():
    param.requires_grad = False

# Descongelar las capas que quieres ajustar (por ejemplo, 'layer4' o 'fc')
for param in resnet.layer4.parameters():  # Afinar el bloque final
    param.requires_grad = True
    
# Reemplazar la capa final con una capa adaptada a tu número de clases
num_classes = train_dataset[0][1].size(0)  # Obtener el número de clases desde el dataset
resnet.fc = nn.Identity()
for param in resnet.fc.parameters():
    param.requires_grad = True

resnet = resnet.to(device)

model = AdaptiveAttentionClassifier(image_feature_dim=image_feature_dim, num_classes=num_classes)
model = model.to(device)

# Optimizador y función de pérdida
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, resnet.parameters()), lr=lr)

#Comprobar que las dimensiones de salida de resnet sean consistentes con la siguiente capa
images, labels = next(iter(train_loader))
images, labels = images.to(device), labels.to(device)
image_features = resnet(images)
print(image_features.shape)

# Listas para almacenar pérdidas y métricas
train_losses = list()
val_losses = list()
train_accuracies = list()
val_accuracies = list()
train_f1_scores = list()
val_f1_scores = list()

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    all_train_labels = []
    all_train_preds = []

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        image_features = resnet(images).flatten(start_dim=1)
        outputs = model(image_features)

        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # Calcular predicciones para métricas
        probs = torch.sigmoid(outputs)
        preds = (probs > 0.5).int()  # Umbral de 0.5 para predicciones binarias
        all_train_labels.append(labels.cpu().numpy())
        all_train_preds.append(preds.cpu().numpy())

    # Calcular métricas para entrenamiento
    all_train_labels = np.vstack(all_train_labels)
    all_train_preds = np.vstack(all_train_preds)
    train_accuracy = accuracy_score(all_train_labels, all_train_preds)
    train_f1 = f1_score(all_train_labels, all_train_preds, average="macro", zero_division=1)

    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(train_accuracy)
    train_f1_scores.append(train_f1)

    # Validación
    model.eval()
    val_loss = 0.0
    all_val_labels = []
    all_val_preds = []

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            image_features = resnet(images).flatten(start_dim=1)
            outputs = model(image_features)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Calcular predicciones para métricas
            probs = torch.sigmoid(outputs)
            preds = (probs > 0.5).int()  # Umbral de 0.5 para predicciones binarias
            all_val_labels.append(labels.cpu().numpy())
            all_val_preds.append(preds.cpu().numpy())

    # Calcular métricas para validación
    all_val_labels = np.vstack(all_val_labels)
    all_val_preds = np.vstack(all_val_preds)
    val_accuracy = accuracy_score(all_val_labels, all_val_preds)
    val_f1 = f1_score(all_val_labels, all_val_preds, average="macro", zero_division=1)

    val_losses.append(val_loss / len(test_loader))
    val_accuracies.append(val_accuracy)
    val_f1_scores.append(val_f1)

    # Imprimir métricas
    print(
        f"Epoch {epoch+1}/{num_epochs}, "
        f"Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}, "
        f"Train Acc: {train_accuracies[-1]:.4f}, Val Acc: {val_accuracies[-1]:.4f}, "
        f"Train F1: {train_f1_scores[-1]:.4f}, Val F1: {val_f1_scores[-1]:.4f}"
    )
    
    # Coloca el modelo en modo evaluación
model.eval()

# Listas para almacenar etiquetas reales y predicciones
all_labels = list()
all_preds = list()

# Realizar inferencia
with torch.no_grad():  # No calculamos gradientes
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Extraer características con ResNet y pasar por el modelo
        image_features = resnet(images).flatten(start_dim=1)
        outputs = model(image_features)

        # Aplicar sigmoid para obtener probabilidades
        probs = torch.sigmoid(outputs)

        # Convertir probabilidades en predicciones binarias (umbral = 0.5)
        preds = (probs > 0.5).int()

        # Guardar etiquetas reales y predicciones
        all_labels.append(labels.cpu().numpy())
        all_preds.append(preds.cpu().numpy())

# Convertir listas a arreglos
all_labels = np.vstack(all_labels)  # Etiquetas reales
all_preds = np.vstack(all_preds)    # Predicciones
test_accuracy = accuracy_score(all_labels, all_preds)
test_f1 = f1_score(all_labels, all_preds, average="macro", zero_division=1)
# Calcular matriz de confusión para cada etiqueta
confusion_matrices = multilabel_confusion_matrix(all_labels, all_preds)

print(f'test_accuracy: {test_accuracy}, test_f1: {test_f1}')

2451
torch.Size([32, 2048])
Epoch 1/10, Train Loss: 0.6279, Val Loss: 0.5459, Train Acc: 0.0054, Val Acc: 0.0402, Train F1: 0.0767, Val F1: 0.1211
Epoch 2/10, Train Loss: 0.5131, Val Loss: 0.4754, Train Acc: 0.0295, Val Acc: 0.0107, Train F1: 0.1879, Val F1: 0.2147
Epoch 3/10, Train Loss: 0.4701, Val Loss: 0.4493, Train Acc: 0.0000, Val Acc: 0.0000, Train F1: 0.2073, Val F1: 0.2000
Epoch 4/10, Train Loss: 0.4540, Val Loss: 0.4386, Train Acc: 0.0000, Val Acc: 0.0000, Train F1: 0.2000, Val F1: 0.2000
Epoch 5/10, Train Loss: 0.4468, Val Loss: 0.4333, Train Acc: 0.0000, Val Acc: 0.0000, Train F1: 0.2000, Val F1: 0.2000
Epoch 6/10, Train Loss: 0.4434, Val Loss: 0.4297, Train Acc: 0.0027, Val Acc: 0.0000, Train F1: 0.2061, Val F1: 0.2000
Epoch 7/10, Train Loss: 0.4400, Val Loss: 0.4270, Train Acc: 0.0000, Val Acc: 0.0000, Train F1: 0.2061, Val F1: 0.2000
Epoch 8/10, Train Loss: 0.4376, Val Loss: 0.4248, Train Acc: 0.0027, Val Acc: 0.0000, Train F1: 0.2064, Val F1: 0.2000
Epoch 9/10, Train Lo

In [4]:
import os
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
from torchvision import transforms
from torchvision.models import resnet50, resnet101, resnet152, ResNet50_Weights,ResNet101_Weights, ResNet152_Weights
from sklearn.metrics import multilabel_confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Visualizar una matriz de confusión para una etiqueta específica
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from SupportFunc.Loader import MultiLabelImageDataset
from SupportFunc.AdaptiveAttention import AdaptiveAttention, AdaptiveTransformerEncoderLayer
from SupportFunc.Visualization import plot_confusion_matrix, plot_train_val_curve

class AdaptiveAttentionClassifier(nn.Module):
    def __init__(self, image_feature_dim, num_classes):
        super(AdaptiveAttentionClassifier, self).__init__()
        self.image_embed = nn.Linear(image_feature_dim, 512)
        self.transformer_encoder = nn.ModuleList([
            AdaptiveTransformerEncoderLayer(d_model=512, nhead=8) for _ in range(1)
        ])
        self.mlp = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes),
        )

    def forward(self, image_features):
        # Proyección de características
        image_embeddings = self.image_embed(image_features)

        # Pasar por las capas del Transformer
        for layer in self.transformer_encoder:
            image_embeddings = layer(image_embeddings.unsqueeze(1)).squeeze(1)

        # Clasificador final
        output = self.mlp(image_embeddings)
        return output

# Obtener el directorio actual
current_directory = os.getcwd()
data_directory = os.path.join(current_directory[:-3], 'data')
images_directory = os.path.join(data_directory, 'images')

num_epochs = 10
batch_size = 32
lr = 0.0001
image_feature_dim = 2048

subset = [os.path.splitext(filename)[0] for filename in os.listdir(images_directory)]
print(len(subset))

# Transformaciones para las imágenes
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Leer y filtrar datos
train_1 = pd.read_csv(f'{data_directory}/train_data.csv')
test_1 = pd.read_csv(f'{data_directory}/test_data.csv')

train_1 = train_1[train_1['ID'].isin(subset[:500])]
test_1 = test_1[test_1['ID'].isin(subset[:500])]

# Guardar los archivos filtrados
train_1.to_csv(f'{data_directory}/train_data_2.csv', encoding = 'utf-8', index=False)
train_1.to_csv(f'{data_directory}/test_data_2.csv', encoding = 'utf-8', index=False)

# Crear datasets y DataLoaders
train_dataset = MultiLabelImageDataset(csv_file=f"{data_directory}/train_data_2.csv", img_dir=f"{images_directory}/", transform=transform)
test_dataset = MultiLabelImageDataset(csv_file=f"{data_directory}/test_data_2.csv", img_dir=f"{images_directory}/", transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Modelo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
resnet = models.resnet50(weights=ResNet50_Weights.DEFAULT)
#resnet = models.resnet101(weights=ResNet101_Weights.DEFAULT)  # Para ResNet101
#resnet = models.resnet152(weights=ResNet152_Weights.DEFAULT)  # Para ResNet152

# Congelar todas las capas inicialmente
for param in resnet.parameters():
    param.requires_grad = False

# Descongelar las capas que quieres ajustar (por ejemplo, 'layer4' o 'fc')
for param in resnet.layer4.parameters():  # Afinar el bloque final
    param.requires_grad = True
    
# Reemplazar la capa final con una capa adaptada a tu número de clases
num_classes = train_dataset[0][1].size(0)  # Obtener el número de clases desde el dataset
resnet.fc = nn.Identity()
for param in resnet.fc.parameters():
    param.requires_grad = True

resnet = resnet.to(device)

model = AdaptiveAttentionClassifier(image_feature_dim=image_feature_dim, num_classes=num_classes)
model = model.to(device)

# Optimizador y función de pérdida
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, resnet.parameters()), lr=lr)

#Comprobar que las dimensiones de salida de resnet sean consistentes con la siguiente capa
images, labels = next(iter(train_loader))
images, labels = images.to(device), labels.to(device)
image_features = resnet(images)
print(image_features.shape)

# Listas para almacenar pérdidas y métricas
train_losses = list()
val_losses = list()
train_accuracies = list()
val_accuracies = list()
train_f1_scores = list()
val_f1_scores = list()

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    all_train_labels = []
    all_train_preds = []

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        image_features = resnet(images).flatten(start_dim=1)
        outputs = model(image_features)

        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # Calcular predicciones para métricas
        probs = torch.sigmoid(outputs)
        preds = (probs > 0.5).int()  # Umbral de 0.5 para predicciones binarias
        all_train_labels.append(labels.cpu().numpy())
        all_train_preds.append(preds.cpu().numpy())

    # Calcular métricas para entrenamiento
    all_train_labels = np.vstack(all_train_labels)
    all_train_preds = np.vstack(all_train_preds)
    train_accuracy = accuracy_score(all_train_labels, all_train_preds)
    train_f1 = f1_score(all_train_labels, all_train_preds, average="macro", zero_division=1)

    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(train_accuracy)
    train_f1_scores.append(train_f1)

    # Validación
    model.eval()
    val_loss = 0.0
    all_val_labels = []
    all_val_preds = []

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            image_features = resnet(images).flatten(start_dim=1)
            outputs = model(image_features)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Calcular predicciones para métricas
            probs = torch.sigmoid(outputs)
            preds = (probs > 0.5).int()  # Umbral de 0.5 para predicciones binarias
            all_val_labels.append(labels.cpu().numpy())
            all_val_preds.append(preds.cpu().numpy())

    # Calcular métricas para validación
    all_val_labels = np.vstack(all_val_labels)
    all_val_preds = np.vstack(all_val_preds)
    val_accuracy = accuracy_score(all_val_labels, all_val_preds)
    val_f1 = f1_score(all_val_labels, all_val_preds, average="macro", zero_division=1)

    val_losses.append(val_loss / len(test_loader))
    val_accuracies.append(val_accuracy)
    val_f1_scores.append(val_f1)

    # Imprimir métricas
    print(
        f"Epoch {epoch+1}/{num_epochs}, "
        f"Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}, "
        f"Train Acc: {train_accuracies[-1]:.4f}, Val Acc: {val_accuracies[-1]:.4f}, "
        f"Train F1: {train_f1_scores[-1]:.4f}, Val F1: {val_f1_scores[-1]:.4f}"
    )
    
    # Coloca el modelo en modo evaluación
model.eval()

# Listas para almacenar etiquetas reales y predicciones
all_labels = list()
all_preds = list()

# Realizar inferencia
with torch.no_grad():  # No calculamos gradientes
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Extraer características con ResNet y pasar por el modelo
        image_features = resnet(images).flatten(start_dim=1)
        outputs = model(image_features)

        # Aplicar sigmoid para obtener probabilidades
        probs = torch.sigmoid(outputs)

        # Convertir probabilidades en predicciones binarias (umbral = 0.5)
        preds = (probs > 0.5).int()

        # Guardar etiquetas reales y predicciones
        all_labels.append(labels.cpu().numpy())
        all_preds.append(preds.cpu().numpy())

# Convertir listas a arreglos
all_labels = np.vstack(all_labels)  # Etiquetas reales
all_preds = np.vstack(all_preds)    # Predicciones
test_accuracy = accuracy_score(all_labels, all_preds)
test_f1 = f1_score(all_labels, all_preds, average="macro", zero_division=1)
# Calcular matriz de confusión para cada etiqueta
confusion_matrices = multilabel_confusion_matrix(all_labels, all_preds)

print(f'test_accuracy: {test_accuracy}, test_f1: {test_f1}')

2451
torch.Size([32, 2048])
Epoch 1/10, Train Loss: 0.6102, Val Loss: 0.5118, Train Acc: 0.0080, Val Acc: 0.0268, Train F1: 0.0755, Val F1: 0.0368
Epoch 2/10, Train Loss: 0.4709, Val Loss: 0.4206, Train Acc: 0.0107, Val Acc: 0.0000, Train F1: 0.1776, Val F1: 0.2033
Epoch 3/10, Train Loss: 0.4068, Val Loss: 0.3789, Train Acc: 0.0000, Val Acc: 0.0000, Train F1: 0.2033, Val F1: 0.2033
Epoch 4/10, Train Loss: 0.3807, Val Loss: 0.3616, Train Acc: 0.0000, Val Acc: 0.0000, Train F1: 0.2000, Val F1: 0.2033
Epoch 5/10, Train Loss: 0.3688, Val Loss: 0.3522, Train Acc: 0.0000, Val Acc: 0.0000, Train F1: 0.2000, Val F1: 0.2000
Epoch 6/10, Train Loss: 0.3617, Val Loss: 0.3458, Train Acc: 0.0000, Val Acc: 0.0000, Train F1: 0.2000, Val F1: 0.2000
Epoch 7/10, Train Loss: 0.3562, Val Loss: 0.3412, Train Acc: 0.0000, Val Acc: 0.0000, Train F1: 0.2000, Val F1: 0.2000
Epoch 8/10, Train Loss: 0.3527, Val Loss: 0.3374, Train Acc: 0.0000, Val Acc: 0.0000, Train F1: 0.2000, Val F1: 0.2000
Epoch 9/10, Train Lo