In [None]:
import torch
import clip
from PIL import Image
import os
import time
import re
import matplotlib.pyplot as plt

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the CLIP model and the image preprocessing callable returned with it
model, preprocess = clip.load("ViT-B/32", device=device)

# Directory that holds the images to describe
image_path = "../data/raw/train"

# os.listdir returns entries in arbitrary order, so sort them to make the
# later "first five" selection reproducible across runs and machines.
# The extension check is case-insensitive so names like "IMG_001.JPG" are kept.
image_files = sorted(
    f for f in os.listdir(image_path) if f.lower().endswith((".jpg", ".png"))
)

if len(image_files) < 5:
    # Raise instead of calling exit(): in a Jupyter kernel exit() requests a
    # kernel shutdown, while an exception stops the run and keeps the session.
    raise RuntimeError("❌ Not enough images in the folder for testing (minimum 5 required).")


In [None]:
# Keep only the first five images for this test run
selected_images = image_files[0:5]

# Candidate damage descriptions (English prompts) to match against each image
text_descriptions = [
    "Dent on the fuselage surface",
    "Crack on the wing structure",
    "Corrosion detected in the engine",
    "Superficial scratch on the body",
    "Impact mark from a foreign object",
    "Severe structural damage observed",
]

# Tokenize the descriptions, then move the token tensor to the selected device
text_tokens = clip.tokenize(text_descriptions)
text_tokens = text_tokens.to(device)

In [None]:
# Extract known aircraft / damage keywords from an image file name
def extract_keywords(filename):
    """Return the recognised keywords found in ``filename`` as one string.

    Underscores and hyphens are treated as word separators. Matching is
    case-insensitive and limited to the fixed vocabulary in the pattern below.

    Args:
        filename: File name (or any text) to scan for keywords.

    Returns:
        The distinct keywords joined by single spaces, in order of first
        appearance and with the casing of their first occurrence, or the
        string ``"Unknown"`` when no keyword is found.
    """
    normalized = filename.replace("_", " ").replace("-", " ")  # Normalize separators
    matches = re.findall(r'\b(Cessna|Beechcraft|Boeing|Wing|Structure|Rudder|Dent|Damage|Trim|Assembly|Fuselage|Hailstorm|Minus|Core)\b', normalized, re.IGNORECASE)

    # De-duplicate case-insensitively while keeping first-seen order. A plain
    # set() would give a different word order from run to run (string hashing
    # is randomised) and would treat "Dent" and "dent" as different keywords.
    unique = {}
    for word in matches:
        unique.setdefault(word.lower(), word)

    return " ".join(unique.values()) if unique else "Unknown"

In [None]:
# Time the whole run (inference plus plotting) with a monotonic clock
start_time = time.perf_counter()

# Process each selected image and collect (file name, best description) pairs
results = []
for img_name in selected_images:
    img_path = os.path.join(image_path, img_name)

    # Open the image in a context manager so the underlying file handle is
    # released once this image has been encoded and drawn.
    with Image.open(img_path) as image:
        image_preprocessed = preprocess(image).unsqueeze(0).to(device)

        # Extract keywords from the file name
        file_keywords = extract_keywords(img_name)

        # Append the extracted keywords to every candidate description
        enhanced_descriptions = [f"{desc} ({file_keywords})" for desc in text_descriptions]

        # Tokenize the enhanced descriptions
        enhanced_text_tokens = clip.tokenize(enhanced_descriptions).to(device)

        # Score the image against each description
        with torch.no_grad():
            image_features = model.encode_image(image_preprocessed)
            text_features = model.encode_text(enhanced_text_tokens)

            # Normalise both embeddings so the product is a cosine similarity;
            # raw dot products are biased by embedding magnitude. The 100.0
            # scale before the softmax follows the zero-shot example in the
            # CLIP README.
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
            similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

        # Pick the most probable description (convert the tensor index to int)
        best_match = enhanced_descriptions[similarity.argmax().item()]

        # Store the result
        results.append((img_name, best_match))

        # Show the image with its predicted description; done while the file
        # is still open so the pixel data can be read for display.
        plt.figure(figsize=(6, 6))
        plt.imshow(image)
        plt.axis("off")
        plt.title(f"Generated description: {best_match}", fontsize=12, color="blue")
        plt.show()

# Stop the timer
end_time = time.perf_counter()
execution_time = end_time - start_time

# Print a summary of the results
print("\n✅ Test results with 5 images (enhanced with filename information):")
for img_name, desc in results:
    print(f"📌 {img_name}: {desc}")

print(f"\n⏳ Total execution time: {execution_time:.2f} seconds")