In [None]:
import torch
import clip
from PIL import Image
import os

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the CLIP model together with its matching image preprocessing pipeline.
model, preprocess = clip.load("ViT-B/32", device=device)

# Folder holding the test images.
image_path = "../data/raw/train"

# Match extensions case-insensitively (so "IMG_01.JPG" is found too) and sort
# the listing: os.listdir returns entries in arbitrary order, which would make
# "the first image" differ between runs and machines.
image_files = sorted(
    f for f in os.listdir(image_path)
    if f.lower().endswith((".jpg", ".jpeg", ".png"))
)

if not image_files:
    # Raise a regular exception rather than calling exit(): the error stops
    # this cell with a clear message instead of asking the interpreter/kernel
    # to shut down.
    raise FileNotFoundError(f"❌ No images found in folder: {image_path}")

In [None]:
image_file = os.path.join(image_path, image_files[0])  # Select the first image

# Load and preprocess the image, then add a leading batch dimension.
# The context manager closes the underlying file handle as soon as the
# preprocessed tensor has been produced.
with Image.open(image_file) as pil_image:
    image = preprocess(pil_image).unsqueeze(0).to(device)

# Candidate damage descriptions (in English) that the image is scored against.
text_descriptions = [
    "Dent on the fuselage surface",
    "Crack on the wing structure",
    "Corrosion detected in the engine",
    "Superficial scratch on the body",
    "Impact mark from a foreign object",
    "Severe structural damage observed"
]

In [None]:
# Tokenize the candidate descriptions, then move the tokens to the
# same device as the model.
text_tokens = clip.tokenize(text_descriptions)
text_tokens = text_tokens.to(device)


In [None]:
# Score the image against every candidate description.
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_tokens)

    # L2-normalise both embeddings so the dot product below is a cosine
    # similarity; without this, descriptions whose embeddings simply have a
    # larger norm are favoured regardless of how well they match the image.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # Scale the cosine similarities by 100 before the softmax, as in the CLIP
    # zero-shot example, so the resulting probabilities are not nearly uniform.
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)


In [None]:
# Pick the description with the highest score.
best_match = text_descriptions[int(torch.argmax(similarity))]

In [None]:
# Report which description was chosen for the evaluated image.
result_message = f"✅ Predicted description for {image_file}: {best_match}"
print(result_message)