Este código se utiliza para procesar las imágenes originales de los experimentos y obtener como resultado dos imagenes, 
en una de ellas se encuentran los mapas de calor por cabeza (son 12 cabezas que usa el transformer) y en la otra imagen
encontramos el mapa de calor del promedio de la atención del ViT en las 12 cabezas también entrega la mediana de la atención

In [1]:
import os
import math
from functools import partial
import torch
import torch.nn as nn
from PIL import Image
from torchvision import transforms
import matplotlib.pyplot as plt
import numpy as np
import gc
import psutil
import warnings
from torch import Tensor
from scipy.stats import mode

warnings.filterwarnings("ignore")

#  📌 Cargar imagenes

Antes de definir el modelo, cargamos las imágenes y realizamos preprocesamiento

In [2]:
folder_path = "C:/Users/UsuarioCompuElite/Desktop/Tesis_doctorado/Articulo_1/metodologia/imagenes_experimento_001"

# Obtener la lista de imágenes disponibles
image_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".jpg")]

# Función para preprocesar imágenes a tensores
def transform(img, img_size):
    img = transforms.Resize(img_size)(img)
    img = transforms.ToTensor()(img)
    return img

# Procesar todas las imágenes y almacenarlas como tensores
processed_images = []
for img_path in image_files:
    img = Image.open(img_path)
    img_tensor = transform(img, (224, 224))  # Redimensionar a 224x224
    processed_images.append(img_tensor)

print(f"[INFO] Se cargaron {len(processed_images)} imágenes correctamente.")

[INFO] Se cargaron 20 imágenes correctamente.


# Configuración del Entorno

Configuración del entorno para ejecutar el modelo en GPU (si está disponible) o en CPU

In [3]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"[INFO] Usando dispositivo: {device}")

[INFO] Usando dispositivo: cpu


# Definición de Funciones Auxiliares

In [4]:
# Función de truncamiento normal para inicialización de pesos
def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
    return _no_grad_trunc_normal_(tensor, mean, std, a, b)

def _no_grad_trunc_normal_(tensor, mean, std, a, b):
    def norm_cdf(x):
        return (1. + math.erf(x / math.sqrt(2.))) / 2.

# Función DropPath
def drop_path(x, drop_prob: float = 0., training: bool = False):
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()
    output = x.div(keep_prob) * random_tensor
    return output

# Definición del Modelo Vision Transformer

Se implementan los bloques principales del modelo ViT

In [5]:

class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)


class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C //
                                  self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x, attn


class Block(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
                       act_layer=act_layer, drop=drop)

    def forward(self, x, return_attention=False):
        y, attn = self.attn(self.norm1(x))
        if return_attention:
            return attn
        x = x + self.drop_path(y)
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x


class PatchEmbed(nn.Module):
    """ 
    Image to Patch Embedding
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        num_patches = (img_size // patch_size) * (img_size // patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        self.proj = nn.Conv2d(in_chans, embed_dim,
                              kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        B, C, H, W = x.shape
        x = self.proj(x).flatten(2).transpose(1, 2)
        return x


class VisionTransformer(nn.Module):
    """ Vision Transformer """

    def __init__(self, img_size=[224], patch_size=16, in_chans=3, num_classes=0, embed_dim=768, depth=12,
                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
                 drop_path_rate=0., norm_layer=nn.LayerNorm, **kwargs):
        super().__init__()
        self.num_features = self.embed_dim = embed_dim

        self.patch_embed = PatchEmbed(
            img_size=img_size[0], patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(
            torch.zeros(1, num_patches + 1, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer)
            for i in range(depth)])
        self.norm = norm_layer(embed_dim)

        # Classifier head
        self.head = nn.Linear(
            embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        trunc_normal_(self.pos_embed, std=.02)
        trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def interpolate_pos_encoding(self, x, w, h):
        npatch = x.shape[1] - 1
        N = self.pos_embed.shape[1] - 1
        if npatch == N and w == h:
            return self.pos_embed
        class_pos_embed = self.pos_embed[:, 0]
        patch_pos_embed = self.pos_embed[:, 1:]
        dim = x.shape[-1]
        w0 = w // self.patch_embed.patch_size
        h0 = h // self.patch_embed.patch_size
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        w0, h0 = w0 + 0.1, h0 + 0.1
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, int(math.sqrt(N)), int(
                math.sqrt(N)), dim).permute(0, 3, 1, 2),
            scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
            mode='bicubic',
        )
        assert int(
            w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def prepare_tokens(self, x):
        B, nc, w, h = x.shape
        x = self.patch_embed(x)  # patch linear embedding

        # add the [CLS] token to the embed patch tokens
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        # add positional encoding to each token
        x = x + self.interpolate_pos_encoding(x, w, h)

        return self.pos_drop(x)

    def forward(self, x):
        x = self.prepare_tokens(x)
        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)
        return x[:, 0]

    def get_last_selfattention(self, x):
        x = self.prepare_tokens(x)
        for i, blk in enumerate(self.blocks):
            if i < len(self.blocks) - 1:
                x = blk(x)
            else:
                # return attention of the last block
                return blk(x, return_attention=True)

    def get_intermediate_layers(self, x, n=1):
        x = self.prepare_tokens(x)
        # we return the output tokens from the `n` last blocks
        output = []
        for i, blk in enumerate(self.blocks):
            x = blk(x)
            if len(self.blocks) - i <= n:
                output.append(self.norm(x))
        return output


class VitGenerator(object):
    def __init__(self, name_model, patch_size, device, evaluate=True, random=False, verbose=False):
        self.name_model = name_model
        self.patch_size = patch_size
        self.evaluate = evaluate
        self.device = device
        self.verbose = verbose
        self.model = self._getModel()
        self._initializeModel()
        if not random:
            self._loadPretrainedWeights()

    def _getModel(self):
        if self.verbose:
            print(
                f"[INFO] Initializing {self.name_model} with patch size of {self.patch_size}")
        if self.name_model == 'vit_tiny':
            model = VisionTransformer(patch_size=self.patch_size, embed_dim=192, depth=12, num_heads=3, mlp_ratio=4,
                                      qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6))

        elif self.name_model == 'vit_small':
            model = VisionTransformer(patch_size=self.patch_size, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4,
                                      qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6))

        elif self.name_model == 'vit_base':
            model = VisionTransformer(patch_size=self.patch_size, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
                                      qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6))
        else:
            raise f"No model found with {self.name_model}"

        return model

    def _initializeModel(self):
        if self.evaluate:
            for p in self.model.parameters():
                p.requires_grad = False

            self.model.eval()

        self.model.to(self.device)

    def _loadPretrainedWeights(self):
        if self.verbose:
            print("[INFO] Loading weights")
        url = None
        if self.name_model == 'vit_small' and self.patch_size == 16:
            url = "dino_deitsmall16_pretrain/dino_deitsmall16_pretrain.pth"

        elif self.name_model == 'vit_small' and self.patch_size == 8:
            url = "dino_deitsmall8_300ep_pretrain/dino_deitsmall8_300ep_pretrain.pth"

        elif self.name_model == 'vit_base' and self.patch_size == 16:
            url = "dino_vitbase16_pretrain/dino_vitbase16_pretrain.pth"

        elif self.name_model == 'vit_base' and self.patch_size == 8:
            url = "dino_vitbase8_pretrain/dino_vitbase8_pretrain.pth"

        if url is None:
            print(
                f"Since no pretrained weights have been found with name {self.name_model} and patch size {self.patch_size}, random weights will be used")

        else:
            state_dict = torch.hub.load_state_dict_from_url(
                url="https://dl.fbaipublicfiles.com/dino/" + url)
            self.model.load_state_dict(state_dict, strict=True)

    def get_last_selfattention(self, img):
        return self.model.get_last_selfattention(img.to(self.device))

    def __call__(self, x):
        return self.model(x)

# Transformación y visualización de las imagenes


In [6]:
def transform(img, img_size):
    img = transforms.Resize(img_size)(img)
    img = transforms.ToTensor()(img)
    return img

# Visualización de atención
def visualize_attention(model, img, patch_size, device):
    w, h = img.shape[1] - img.shape[1] % patch_size, img.shape[2] - img.shape[2] % patch_size
    img = img[:, :w, :h].unsqueeze(0)

    w_featmap = img.shape[-2] // patch_size
    h_featmap = img.shape[-1] // patch_size

    attentions = model.get_last_selfattention(img.to(device))

    nh = attentions.shape[1]
    attentions = attentions[0, :, 0, 1:].reshape(nh, -1)
    attentions = attentions.reshape(nh, w_featmap, h_featmap)
    attentions = torch.nn.functional.interpolate(
        attentions.unsqueeze(0), scale_factor=patch_size, mode="nearest"
    )[0].cpu().numpy()
    return attentions


# Gráficos de los módulos atención

In [7]:
# Guardar gráficos de atención
def plot_attention(img, attention, path):
    n_heads = attention.shape[0]
    
    # Extraer el nombre de la carpeta basado en `path`
    base_name = os.path.basename(path).split(".")[0]  # Por ejemplo, "cesteria_01"
    
    # Crear el directorio de salida correcto
    output_dir = os.path.join(
        "C:/Users/UsuarioCompuElite/Desktop/Tesis_doctorado/Articulo_1/metodologia/Resultados_vit_experimento_001",
        base_name,
        "csv"
    )
    os.makedirs(output_dir, exist_ok=True)
    print(f"[INFO] Directorio de salida: {output_dir}")

    # **1. Visualización de la imagen original y la mediana de atención**
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.imshow(img)
    plt.title("Original Image")
    plt.axis("off")

    plt.subplot(1, 2, 2)
    attention_median = np.median(attention, axis=0)  # Calcular la mediana
    plt.imshow(attention_median, cmap='jet')
    plt.title("Attention Median")
    plt.axis("off")

    # Guardar la mediana como CSV
    median_csv_path = os.path.join(output_dir, f"{base_name}_attention_median.csv")
    np.savetxt(median_csv_path, attention_median, delimiter=',')
    print(f"[INFO] CSV de mediana guardado en: {median_csv_path}")

    # Guardar la visualización de la mediana como imagen
    median_output_path = f"{path}_attention_median.png"
    plt.savefig(median_output_path, bbox_inches='tight')
    print(f"[INFO] Imagen de mediana guardada en: {median_output_path}")
    plt.close()

    # **2. Visualización de la imagen original y el promedio de atención**
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.imshow(img)
    plt.title("Original Image")
    plt.axis("off")

    plt.subplot(1, 2, 2)
    attention_mean = np.mean(attention, axis=0)  # Calcular la media
    plt.imshow(attention_mean, cmap='jet')
    plt.title("Attention Mean")
    plt.axis("off")

    # Guardar el promedio como CSV
    mean_csv_path = os.path.join(output_dir, f"{base_name}_attention_mean.csv")
    np.savetxt(mean_csv_path, attention_mean, delimiter=',')
    print(f"[INFO] CSV de promedio guardado en: {mean_csv_path}")

    # Guardar la visualización del promedio como imagen
    mean_output_path = f"{path}_attention_mean.png"
    plt.savefig(mean_output_path, bbox_inches='tight')
    print(f"[INFO] Imagen de promedio guardada en: {mean_output_path}")
    plt.close()

    # **3. Visualización de los mapas de atención por cabeza**
    plt.figure(figsize=(10, 10))
    for i in range(n_heads):
        plt.subplot((n_heads + 2) // 3, 3, i + 1)
        plt.imshow(img)
        plt.imshow(attention[i], cmap='jet', alpha=0.6)
        plt.title(f"Head {i+1}")
        plt.axis("off")

        # Guardar cada mapa de atención individual como CSV
        head_csv_path = os.path.join(output_dir, f"{base_name}_attention_h{i}.csv")
        np.savetxt(head_csv_path, attention[i], delimiter=',')
        print(f"[INFO] CSV de atención de la cabeza {i+1} guardado en: {head_csv_path}")

    # Guardar la visualización de las cabezas como imagen
    heads_output_path = f"{path}_attention_heads.png"
    plt.savefig(heads_output_path, bbox_inches='tight')
    print(f"[INFO] Imagen de cabezas guardada en: {heads_output_path}")
    plt.close()
    
# Visualización de predicciones
def visualize_predict(model, img, img_size, patch_size, device, path):
    img_pre = transform(img, img_size)
    attention = visualize_attention(model, img_pre, patch_size, device)
    plot_attention(img, attention, path)

# Inicialización del dispositivo y modelo

In [8]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
if device.type == "cuda":
    torch.cuda.set_device(0)

name_model = 'vit_base'
patch_size = 16

model = VitGenerator(name_model, patch_size, device, evaluate=True, random=False, verbose=True)

[INFO] Initializing vit_base with patch size of 16
[INFO] Loading weights


# Procesar todas las imágenes en la carpeta

## carpeta para guardar los resultados

In [9]:
output_folder = r"C:\Users\UsuarioCompuElite\Desktop\Tesis_doctorado\Articulo_1\metodologia\Resultados_articulo1\resultados_heatmap_ViT"
os.makedirs(output_folder, exist_ok=True)  # Crear carpeta si no existe

In [10]:
# Procesar una imagen a la vez
def process_image(file_path):
    img = Image.open(file_path)
    try:
        print(f"[INFO] Procesando imagen: {os.path.basename(file_path)}")
        print(f"[INFO] Memoria RAM utilizada antes: {psutil.virtual_memory().used / (1024 ** 3):.2f} GB")

        # Transformar la imagen a formato esperado (PIL -> Tensor)
        img_size = tuple(np.array(img.size[::-1]))  # Tamaño original de la imagen
        img_tensor = transform(img, img_size)  # Transformar PIL -> Tensor

        # Llamar a la función de procesamiento con el modelo
        output_base = os.path.splitext(file_path)[0]
        visualize_predict(model, img, img_size, patch_size, device, output_base)

        print(f"[INFO] Memoria RAM utilizada después: {psutil.virtual_memory().used / (1024 ** 3):.2f} GB")

    except Exception as e:
        print(f"[ERROR] Error procesando la imagen {os.path.basename(file_path)}: {e}")

    finally:
        # Liberar recursos asociados a la imagen
        img.close()
        del img
        del img_tensor

        # Liberar memoria de GPU y RAM
        if device.type == "cuda":
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

        # Forzar recolección de basura
        gc.collect()

# Procesar imágenes una por una
def main():
    for filename in os.listdir(output_folder):
        if filename.endswith(".jpg"):
            file_path = os.path.join(output_folder, filename)
            process_image(file_path)

print("[INFO] Procesamiento completo.")

if __name__ == "__main__":
    main()

[INFO] Procesamiento completo.


# Calcular la media y la desviación estándar para las distribuciones de los heatmap de cada imagen procesada por el ViT

In [2]:
import os
import numpy as np
import pandas as pd

# Lista de nombres de imágenes
imagenes = [
    "basketry_01", "basketry_02", "basketry_03", "basketry_04", "basketry_05",
    "basketry_06", "basketry_07", "basketry_08", "basketry_09", "basketry_10",
    "jar_01", "jar_02", "jar_03", "jar_04", "jar_05",
    "jar_06", "jar_07", "jar_08", "jar_09", "jar_10",
]

imagenes2 = [
    "cesteria_01", "cesteria_02", "cesteria_03", "cesteria_04", "cesteria_05",
    "cesteria_06", "cesteria_07", "cesteria_08", "cesteria_09", "cesteria_10",
    "jarra_01", "jarra_02", "jarra_03", "jarra_04", "jarra_05",
    "jarra_06", "jarra_07", "jarra_08", "jarra_09", "jarra_10"
]

# Ruta base donde están los CSVs de atención del ViT
base_vit_path = r"C:/Users/UsuarioCompuElite/Desktop/Tesis_doctorado/Articulo_1/metodologia/Resultados_vit_experimento_001"

# Lista para guardar estadísticas
estadisticas_vit = []

for img in imagenes:
    try:
        csv_path = os.path.join(base_vit_path, img, "normalized_csv", f"{img}_attention_mean_normalized.csv")
        data = np.loadtxt(csv_path, delimiter=",")
        media = np.mean(data)
        desviacion = np.std(data)

        estadisticas_vit.append({
            "Imagen": img,
            "Media": round(media, 6),
            "Desviación estándar": round(desviacion, 6)
        })

    except Exception as e:
        print(f"❌ Error con {img}: {e}")

# Crear DataFrame ordenado
df_vit = pd.DataFrame(estadisticas_vit).sort_values("Imagen").reset_index(drop=True)

# Mostrar tabla en consola
print("\n" + "="*65)
print("         DISTRIBUCIÓN_HEATMAP_ViT_ATTENTION_MEAN")
print("="*65)
print(df_vit.to_string(index=False))

# Guardar como CSV
output_csv = os.path.join(base_vit_path, "estadisticas_heatmaps_ViT.csv")
df_vit.to_csv(output_csv, index=False)
print(f"\n✅ CSV guardado en:\n{output_csv}")


❌ Error con basketry_01: C:/Users/UsuarioCompuElite/Desktop/Tesis_doctorado/Articulo_1/metodologia/Resultados_vit_experimento_001\basketry_01\normalized_csv\basketry_01_attention_mean_normalized.csv not found.
❌ Error con basketry_02: C:/Users/UsuarioCompuElite/Desktop/Tesis_doctorado/Articulo_1/metodologia/Resultados_vit_experimento_001\basketry_02\normalized_csv\basketry_02_attention_mean_normalized.csv not found.
❌ Error con basketry_03: C:/Users/UsuarioCompuElite/Desktop/Tesis_doctorado/Articulo_1/metodologia/Resultados_vit_experimento_001\basketry_03\normalized_csv\basketry_03_attention_mean_normalized.csv not found.
❌ Error con basketry_04: C:/Users/UsuarioCompuElite/Desktop/Tesis_doctorado/Articulo_1/metodologia/Resultados_vit_experimento_001\basketry_04\normalized_csv\basketry_04_attention_mean_normalized.csv not found.
❌ Error con basketry_05: C:/Users/UsuarioCompuElite/Desktop/Tesis_doctorado/Articulo_1/metodologia/Resultados_vit_experimento_001\basketry_05\normalized_csv\bas

KeyError: 'Imagen'