# Latent Space Mining

StyleGAN3’s latent space (Z) is unstructured. We automated a mining process using OpenAI CLIP to map it.

CLIP prompts:
1. "intense fiery red and orange colors, aggressive sharp jagged shapes, high energy dynamic movement"
2. "peaceful soft blue and teal colors, calm static atmosphere, smooth blurry gradients"
3. "vibrant multicolored geometric lines, energetic neon patterns, sharp contrast"   

Each entry of the generated dataset has the form [latent_vector, score_prompt_1, score_prompt_2, score_prompt_3].

In [None]:
import os
import sys
import torch
import clip
import pickle
import numpy as np
from PIL import Image
from tqdm import tqdm

# FIX PATH: Add root directory to system path to allow imports from src.
# `__file__` is not defined when this code runs as a notebook cell or in a
# REPL, so fall back to the current working directory in that case.
try:
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_dir = os.getcwd()
root_path = os.path.abspath(os.path.join(current_dir, '..'))
if root_path not in sys.path:
    sys.path.insert(0, root_path)

from src.gan_manager import GANManager

def run_miner():
    """
    Executes the Latent Space Mining process.

    For NUM_SAMPLES iterations this function samples a random latent vector,
    asks the GAN for an image, scores that image with CLIP (Contrastive
    Language-Image Pre-Training) against the three text prompts, and records
    the latent vector together with the softmax-normalised scores.

    The result is pickled to OUTPUT_PATH as a dict with two keys:
        'prompts': the list of prompt strings, in scoring order.
        'data':    a list of {'z': ndarray, 'scores': ndarray} entries, where
                   'scores' has one value per prompt.

    Returns:
        None. The dataset is written to disk.
    """
    MODEL_PATH = './resources/network-snapshot-000280.pkl'
    OUTPUT_PATH = './data/dataset_clip.pkl'
    NUM_SAMPLES = 5000

    # DEFINITION OF 3 CHROMATIC MOODS (Text Prompts)
    PROMPTS = [
        "intense fiery red and orange colors, aggressive sharp jagged shapes, high energy dynamic movement", # RED/INTENSE
        "peaceful soft blue and teal colors, calm static atmosphere, smooth blurry gradients",          # BLUE/CALM
        "vibrant multicolored geometric lines, energetic neon patterns, sharp contrast"                 # INTERMEDIATE/LINES
    ]

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"[*] Miner started on: {device}")

    # Load CLIP model and tokenize the prompts once; they are reused for every image
    model_clip, preprocess = clip.load("ViT-B/32", device=device)
    text_tokens = clip.tokenize(PROMPTS).to(device)

    # Initialize GAN on the same device that was selected for CLIP, instead of
    # unconditionally requesting the GPU on machines without CUDA.
    gan = GANManager(MODEL_PATH, use_gpu=(device == "cuda"))

    dataset = []

    print(f"[*] Analyzing {NUM_SAMPLES} images...")

    for _ in tqdm(range(NUM_SAMPLES)):
        # Generate a random latent vector z
        z = torch.randn(1, gan.latent_dim).to(device)

        with torch.no_grad():
            # Generate image from the GAN
            # NOTE(review): the image is generated from a constant
            # np.zeros(1024) input, not from the `z` sampled above, so the
            # stored `z` may be unrelated to the image that CLIP scores.
            # Confirm against GANManager.generate_image's signature (z vs
            # seed vs audio input) and pass `z` if it is supported.
            img_np = gan.generate_image(np.zeros(1024), None)
            img_pil = Image.fromarray(img_np)

            # Preprocess for CLIP
            image_input = preprocess(img_pil).unsqueeze(0).to(device)

            # Calculate similarity scores (logits) between image and text prompts,
            # then normalise them across the prompts with a softmax
            logits_per_image, _ = model_clip(image_input, text_tokens)
            probs = logits_per_image.softmax(dim=-1).cpu().numpy()[0]

            # Append latent vector and corresponding scores to dataset
            dataset.append({'z': z.cpu().numpy(), 'scores': probs})

    # Save the dataset (create the output directory on first run)
    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    with open(OUTPUT_PATH, 'wb') as f:
        pickle.dump({'prompts': PROMPTS, 'data': dataset}, f)

    print(f"[✔] Mining completed on {len(PROMPTS)} classes! Saved to {OUTPUT_PATH}")

# MLP TRAINING

A custom Multi-Layer Perceptron (MLP) acts as a translator between audio features and visual moods.<br>
Training requires no manual labels: the CLIP-mined dataset is used as the ground truth.
<br><br>
The ranges below are empirically observed min/max values for each audio feature; they are used to simulate realistic inputs.
```python
RANGES = {
    'spectral_contrast':   {'min': (10.0, 13.03), 'med': (12.0, 14.19), 'max': (13.8, 15.33)},
    'spectral_flatness':   {'min': (0.0012, 0.0041), 'med': (0.0028, 0.0090), 'max': (0.0049, 0.0154)},
    'onset_strength':      {'min': (0.207, 0.502), 'med': (0.404, 0.738), 'max': (0.632, 0.869)},
    'zero_crossing_rate':  {'min': (0.0051, 0.0143), 'med': (0.0173, 0.0455), 'max': (0.0375, 0.0666)},
    'chroma_variance':     {'min': (0.0045, 0.0166), 'med': (0.0095, 0.0221), 'max': (0.0126, 0.0267)}
}
```

In [None]:
import os
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# --- ARCHITECTURE (Synchronized with mlp_manager.py) ---
class MoodMLP(nn.Module):
    """
    Feed-forward regressor from an audio-feature vector to a GAN latent vector (w/z).

    Layer widths are input_size -> 64 -> 128 -> 256 -> output_size. Every hidden
    layer is followed by a ReLU, and the first two are additionally followed by
    BatchNorm1d. All layers live in `self.network`, so state-dict keys have the
    form `network.<index>.<param>`.

    Args:
        input_size (int): Number of audio features (default: 5).
        output_size (int): Dimension of the GAN latent space (default: 512).
    """
    def __init__(self, input_size=5, output_size=512):
        super().__init__()
        # Layers are created in forward order so parameter initialisation
        # and Sequential indices stay unchanged.
        stack = [
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, output_size),
        ]
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run `x` through the layer stack and return the predicted latent."""
        return self.network(x)

# --- REAL AUDIO FEATURE RANGES ---
# Empirically observed min/max values for audio features to simulate realistic inputs.
# Layout: RANGES[feature_name][level] -> (low, high), where level is one of
# 'min', 'med' or 'max'. get_feat() draws a uniform random value from the
# selected (low, high) interval. Intervals of adjacent levels overlap.
RANGES = {
    'spectral_contrast':   {'min': (10.0, 13.03), 'med': (12.0, 14.19), 'max': (13.8, 15.33)},
    'spectral_flatness':   {'min': (0.0012, 0.0041), 'med': (0.0028, 0.0090), 'max': (0.0049, 0.0154)},
    'onset_strength':      {'min': (0.207, 0.502), 'med': (0.404, 0.738), 'max': (0.632, 0.869)},
    'zero_crossing_rate':  {'min': (0.0051, 0.0143), 'med': (0.0173, 0.0455), 'max': (0.0375, 0.0666)},
    'chroma_variance':     {'min': (0.0045, 0.0166), 'med': (0.0095, 0.0221), 'max': (0.0126, 0.0267)}
}

def get_feat(name, level):
    """Draw a uniformly distributed float from the RANGES[name][level] interval."""
    interval = RANGES[name][level]
    return random.uniform(interval[0], interval[1])

def train_mlp():
    """
    Fit MoodMLP to regress synthetic audio-feature vectors onto CLIP-selected latents.

    Steps:
    1. Load './data/dataset_clip.pkl' and, for each prompt, keep the K
       highest-scoring latent vectors.
    2. Pair each kept latent with 30 randomly drawn audio-feature vectors whose
       intensity levels depend on the mood:
         - mood 0 (Red/Aggressive):   high levels, medium chroma variance
         - mood 1 (Blue/Calm):        low levels
         - otherwise (Vibrant/Geom.): medium levels, high chroma variance
    3. Standardise the features, train for 300 epochs with Adam (lr=0.001)
       and MSE loss in shuffled batches of 128.
    4. Save the weights to './resources/mood_mlp.pth' and the fitted scaler to
       './resources/scaler.pkl'.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Feature order defines the MLP input layout; each level tuple below is
    # aligned with it.
    feature_names = ('spectral_contrast', 'spectral_flatness', 'onset_strength',
                     'zero_crossing_rate', 'chroma_variance')
    aggressive_levels = ('max', 'max', 'max', 'max', 'med')   # MOOD 1: RED / AGGRESSIVE
    calm_levels = ('min', 'min', 'min', 'min', 'min')         # MOOD 2: BLUE / CALM
    vibrant_levels = ('med', 'med', 'med', 'med', 'max')      # MOOD 3: VIBRANT / GEOMETRIC
    levels_by_mood = {0: aggressive_levels, 1: calm_levels}

    # Load the CLIP-mined dataset containing (latent_vector, scores)
    with open('./data/dataset_clip.pkl', 'rb') as f:
        raw_data = pickle.load(f)
    entries = raw_data['data']

    # Number of top-scoring images kept per mood category
    K = 1000
    print(f"[*] Creating balanced dataset (Top-{K} for each of the 3 moods)...")

    X, Y = [], []
    for mood_idx, _prompt in enumerate(raw_data['prompts']):
        # Scores of every sample for this mood, then the K best sample indices
        mood_scores = [entry['scores'][mood_idx] for entry in entries]
        best_indices = np.argsort(mood_scores)[-K:]

        # Any mood index other than 0 or 1 uses the vibrant levels
        levels = levels_by_mood.get(mood_idx, vibrant_levels)

        for sample_idx in best_indices:
            latent = entries[sample_idx]['z'].flatten()

            # Data augmentation: 30 random feature vectors per latent
            for _ in range(30):
                features = [get_feat(feat_name, feat_level)
                            for feat_name, feat_level in zip(feature_names, levels)]
                X.append(features)
                Y.append(latent)

    # Convert to NumPy arrays and standardise the input features
    X_np = np.array(X, dtype=np.float32)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_np)
    Y_np = np.array(Y, dtype=np.float32)

    # Prepare DataLoader
    inputs = torch.FloatTensor(X_scaled)
    targets = torch.FloatTensor(Y_np)
    loader = DataLoader(TensorDataset(inputs, targets), batch_size=128, shuffle=True)

    # Initialize Model and Optimizer
    model = MoodMLP().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()

    print("[*] Training (300 epochs)...")
    for _epoch in tqdm(range(300)):
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()
            loss = criterion(model(batch_x), batch_y)
            loss.backward()
            optimizer.step()

    # Save Model and Scaler (weights are moved to CPU before saving)
    model.to('cpu')
    torch.save(model.state_dict(), './resources/mood_mlp.pth')

    with open('./resources/scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)

    print("[✔] Model saved! The GAN will now react to the 3 audio moods.")