# Creazione Embedding con FLUX CLIP Text Encoder

Questo notebook crea gli embedding di un prompt vuoto usando il text encoder CLIP del modello FLUX (Black Forest Labs), scaricato da Hugging Face.

## 1. Installazione delle dipendenze

In [None]:
!pip install transformers torch diffusers accelerate

## 2. Import delle librerie

In [None]:
import torch
from transformers import CLIPTextModel, CLIPTokenizer
import numpy as np

## 3. Caricamento del modello CLIP Text Encoder da FLUX

In [None]:
# Pick the compute device: CUDA when available, otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Hugging Face repository holding the FLUX weights
# (alternative: "black-forest-labs/FLUX.1-schnell").
# NOTE(review): this repository may be gated and require Hugging Face
# authentication before the download succeeds -- confirm.
model_id = "black-forest-labs/FLUX.1-dev"

# Half precision on GPU, full precision on CPU.
if device == "cuda":
    encoder_dtype = torch.float16
else:
    encoder_dtype = torch.float32

print("Loading CLIP text encoder...")
# Load from the "text_encoder" subfolder, move to the chosen device and
# switch the module to evaluation mode.
text_encoder = CLIPTextModel.from_pretrained(
    model_id, subfolder="text_encoder", torch_dtype=encoder_dtype
).to(device).eval()

print("Loading tokenizer...")
tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")

print("Model loaded successfully!")

## 4. Creazione degli embedding con prompt vuoto

In [None]:
# The prompt being embedded is the empty string.
prompt = ""

# Tokenize, padding and truncating to the tokenizer's maximum length, and
# return PyTorch tensors.
text_inputs = tokenizer(
    prompt,
    return_tensors="pt",
    truncation=True,
    max_length=tokenizer.model_max_length,
    padding="max_length",
)
text_input_ids = text_inputs.input_ids.to(device)

# Forward pass with gradient tracking disabled.
with torch.no_grad():
    text_embeddings = text_encoder(text_input_ids, output_hidden_states=True)

# Keep the two encoder outputs used by the rest of the notebook.
prompt_embeds = text_embeddings.last_hidden_state
pooled_embeds = text_embeddings.pooler_output

print(f"Prompt embeddings shape: {prompt_embeds.shape}")
print(f"Pooled embeddings shape: {pooled_embeds.shape}")

## 5. Visualizzazione delle informazioni sugli embedding

In [None]:
# NumPy copies of the embeddings for inspection. They keep the tensors'
# original dtype; only the statistics below use a wider accumulator.
prompt_embeds_np = prompt_embeds.cpu().numpy()
pooled_embeds_np = pooled_embeds.cpu().numpy()


def _print_embedding_stats(title, values):
    """Print shape, mean, std, min and max of a NumPy array.

    Mean and std are accumulated in float64. When the encoder runs on CUDA it
    is loaded in float16, and NumPy otherwise computes ``std`` in the input
    precision, which can lose accuracy or overflow for half-precision data.

    Args:
        title (str): Heading printed before the statistics.
        values (numpy.ndarray): Array to summarize.
    """
    print(f"\n{title}:")
    print(f"  Shape: {values.shape}")
    print(f"  Mean: {values.mean(dtype=np.float64):.6f}")
    print(f"  Std: {values.std(dtype=np.float64):.6f}")
    print(f"  Min: {values.min():.6f}")
    print(f"  Max: {values.max():.6f}")


print("=== Informazioni sugli Embedding ===")
_print_embedding_stats("Prompt embeddings", prompt_embeds_np)
_print_embedding_stats("Pooled embeddings", pooled_embeds_np)

## 6. Salvataggio degli embedding

In [None]:
# Output file names, defined once so the save calls and the summary printed
# at the end cannot drift apart.
PROMPT_EMBEDS_NPY = "flux_clip_empty_prompt_embeds.npy"
POOLED_EMBEDS_NPY = "flux_clip_empty_pooled_embeds.npy"
EMBEDDINGS_PT = "flux_clip_empty_embeddings.pt"

# Save the embeddings in NumPy format.
np.save(PROMPT_EMBEDS_NPY, prompt_embeds_np)
np.save(POOLED_EMBEDS_NPY, pooled_embeds_np)

# Save them in PyTorch format as well. The tensors are copied to CPU first:
# tensors saved from a CUDA device cannot be restored by a plain torch.load()
# on a machine without CUDA (it would need map_location).
torch.save(
    {
        "prompt_embeds": prompt_embeds.detach().cpu(),
        "pooled_embeds": pooled_embeds.detach().cpu(),
    },
    EMBEDDINGS_PT,
)

print("Embedding salvati con successo!")
for saved_path in (PROMPT_EMBEDS_NPY, POOLED_EMBEDS_NPY, EMBEDDINGS_PT):
    print(f"  - {saved_path}")

## 7. (Opzionale) Confronto con prompt non vuoto

In [None]:
# Reference prompt to compare against the empty-prompt embedding.
test_prompt = "a beautiful landscape"

# Tokenize with the same padding/truncation settings used for the empty
# prompt so both embeddings have the same shape.
test_inputs = tokenizer(
    test_prompt,
    padding="max_length",
    max_length=tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt"
).input_ids.to(device)

with torch.no_grad():
    test_embeddings = text_encoder(test_inputs)
    test_embeds = test_embeddings.last_hidden_state

# L2 distance over all elements. Both operands are upcast to float32 first:
# on CUDA the encoder runs in float16, which is too coarse for a value that
# is reported with four decimals. torch.linalg.vector_norm flattens the input
# and computes the 2-norm by default, replacing the deprecated torch.norm.
distance = torch.linalg.vector_norm(
    prompt_embeds.float() - test_embeds.float()
).item()
print(f"Distanza L2 tra prompt vuoto e '{test_prompt}': {distance:.4f}")

## 8. (Opzionale) Funzione riutilizzabile

In [None]:
def get_clip_embeddings(prompt, text_encoder, tokenizer, device):
    """Encode a prompt with a CLIP tokenizer / text-encoder pair.

    The prompt is tokenized with padding and truncation to
    ``tokenizer.model_max_length``, the token ids are moved to ``device``,
    and the encoder is called with gradient tracking disabled.

    Args:
        prompt: Text to encode; forwarded unchanged to ``tokenizer``.
        text_encoder: Callable model whose output exposes
            ``last_hidden_state`` and ``pooler_output``.
        tokenizer: Callable tokenizer exposing ``model_max_length``; its
            result must expose ``input_ids``.
        device: Device the token ids are moved to (e.g. "cuda" or "cpu").

    Returns:
        tuple: ``(prompt_embeds, pooled_embeds)``, i.e. the encoder output's
        ``last_hidden_state`` and ``pooler_output``.
    """
    tokenizer_options = dict(
        padding="max_length",
        max_length=tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )
    token_ids = tokenizer(prompt, **tokenizer_options).input_ids.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        encoder_output = text_encoder(token_ids, output_hidden_states=True)

    return encoder_output.last_hidden_state, encoder_output.pooler_output

# Example usage: embed the empty prompt with the already-loaded encoder and
# tokenizer, then report the shape of the first returned tensor.
empty_embeds, empty_pooled = get_clip_embeddings(
    prompt="",
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    device=device,
)
print(f"Embedding generati: {empty_embeds.shape}")