In [None]:
import yaml

def load_config(config_path):
    """Read a YAML file and return its parsed content.

    Args:
        config_path: Path to the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document (a dict for a
        mapping at the top level; PyYAML yields ``None`` for an empty file).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Pin the encoding: without it, open() falls back to the platform locale,
    # so a config containing non-ASCII text can parse differently per machine.
    with open(config_path, "r", encoding="utf-8") as f:
        # safe_load only builds plain Python objects; it never runs arbitrary tags.
        return yaml.safe_load(f)

# Location of the CIAP YAML config. The path is relative, so it is resolved
# against the notebook's current working directory.
config_path = "audioldm/ciap/configs/ciap_config.yaml"
# Parsed configuration; later cells index it as config["model"] and config["dataset"].
config = load_config(config_path)

In [None]:
import os

import torch
from audioldm.ciap.models.image_encoder import ImageEncoder
from audioldm.ciap.models.audio_encoder import AudioEncoder

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build both encoders from their sections of the parsed config and move them to `device`.
image_encoder = ImageEncoder(config["model"]["image_encoder"]).to(device)
audio_encoder = AudioEncoder(config["model"]["audio_encoder"]).to(device)

# Checkpoint locations. These used to be absolute paths into one user's home
# directory, which broke the notebook everywhere else. They are now resolved
# relative to the working directory (like the config path and the LDM checkpoint
# used elsewhere in this notebook); set CIAP_CKPT_DIR to point somewhere else.
ckpt_dir = os.environ.get("CIAP_CKPT_DIR", os.path.join(".", "ckpt", "ciap"))
image_encoder_ckpt = os.path.join(ckpt_dir, "ciap_image_encoder.pt")
audio_encoder_ckpt = os.path.join(ckpt_dir, "ciap_audio_encoder.pt")

# Fail early with an actionable message instead of a bare path error from torch.load.
for ckpt_path in (image_encoder_ckpt, audio_encoder_ckpt):
    if not os.path.isfile(ckpt_path):
        raise FileNotFoundError(
            f"Checkpoint not found: {ckpt_path!r}. Run the notebook from the repository "
            "root or set the CIAP_CKPT_DIR environment variable."
        )

# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source. If the installed torch version supports it, consider passing
# weights_only=True here.
# map_location remaps the stored tensors onto `device`, so a checkpoint saved on
# a GPU machine can still be loaded on a CPU-only one.
image_encoder.load_state_dict(torch.load(image_encoder_ckpt, map_location=device))
audio_encoder.load_state_dict(torch.load(audio_encoder_ckpt, map_location=device))

print("Checkpoints loaded successfully!")

In [None]:
import os
import torch
from audioldm import build_model, save_wave
from audioldm.ciap.models.image_encoder import ImageEncoder
from audioldm.ciap.models.audio_encoder import AudioEncoder
from audioldm.ciap.datasets.paired_image_audio_dataset import PairedImageAudioDataset
from torch.utils.data import DataLoader


def _generate_waveform(model, prompt):
    """Generate one waveform from a text prompt with the latent diffusion model.

    Uses ``model.generate`` when the model object provides it. Otherwise falls
    back to the package-level ``text_to_audio`` function, which is the entry
    point upstream AudioLDM exposes for this exact set of keyword arguments.

    Args:
        model: The object returned by ``build_model``.
        prompt: Text prompt passed as the ``text`` argument.

    Returns:
        Whatever the underlying generation call returns.
    """
    gen_kwargs = dict(
        text=prompt,
        seed=42,
        duration=10.0,
        guidance_scale=2.5,
        n_candidate_gen_per_text=1,
        batchsize=1,
    )
    if hasattr(model, "generate"):
        return model.generate(**gen_kwargs)
    # NOTE(review): upstream AudioLDM's model class has no `generate` method;
    # confirm which of the two entry points this fork is meant to use.
    from audioldm import text_to_audio  # imported lazily: only needed on the fallback path
    return text_to_audio(model, **gen_kwargs)


# Set CIAP models to evaluation mode
image_encoder.eval()
audio_encoder.eval()

# Load LDM model
ldm_ckpt_path = "./ckpt/audioldm-s-full.ckpt"
audioldm = build_model(ckpt_path=ldm_ckpt_path)

# Define dataset for inference
dataset_path = config["dataset"]["val"]["path"]  # Use the validation dataset path
image_ext = config["dataset"]["val"].get("image_extension", ".png")
audio_ext = config["dataset"]["val"].get("audio_extension", ".wav")

dataset = PairedImageAudioDataset(dataset_path, image_ext=image_ext, audio_ext=audio_ext)
# Pinned host memory only speeds up host-to-GPU copies; on a CPU-only run it is
# useless and makes the DataLoader emit a warning, so enable it only for CUDA.
dataloader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    num_workers=4,
    pin_memory=(device.type == "cuda"),
)

# Perform inference
output_dir = "./output"
os.makedirs(output_dir, exist_ok=True)

# No gradients are needed for inference; disabling them avoids building autograd graphs.
with torch.no_grad():
    for idx, (image, audio) in enumerate(dataloader):
        image = image.to(device)
        audio = audio.to(device)

        # Step 1: Get embeddings from CIAP model.
        # Prefer a dedicated `encode` method when the encoder has one; otherwise call the module.
        img_emb = image_encoder.encode(image) if hasattr(image_encoder, "encode") else image_encoder(image)
        aud_emb = audio_encoder.encode(audio) if hasattr(audio_encoder, "encode") else audio_encoder(audio)

        print(f"Sample {idx + 1}:")
        print(f"Image Embedding: {img_emb}")
        print(f"Audio Embedding: {aud_emb}")

        # Step 2: Use LDM to generate audio.
        # The CIAP embeddings above are NOT fed into the LDM yet: generation is
        # conditioned only on this placeholder text prompt.
        text_prompt = f"Generated audio for sample {idx + 1}"
        waveform = _generate_waveform(audioldm, text_prompt)

        # Step 3: Save the generated audio.
        # The output directory is passed positionally: upstream AudioLDM spells this
        # parameter `savepath`, so the keyword `save_path=` would raise a TypeError there.
        save_wave(waveform, output_dir, name=f"generated_audio_{idx + 1}")
        # NOTE(review): save_wave may append its own suffix to `name`; confirm the
        # file name printed below matches what is actually written.
        print(f"Generated audio saved to {output_dir}/generated_audio_{idx + 1}.wav")
        print("-" * 50)