# Training CIAP Contrastive Model on Google Colab
This notebook demonstrates how to train the CIAP contrastive model on Google Colab. The trained weights for the ImageEncoder and AudioEncoder models will be saved for later use.

## Section 1: Install Required Libraries
Install necessary libraries such as PyTorch, PyYAML, and tqdm using pip.

## Section 2: Clone the Repository and Set Up Environment
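Clone the AudioLDM repository, change into its root directory, and install its dependencies. The cells below import from the `audioldm.ciap` package and open `audioldm/ciap/configs/ciap_config.yaml` by relative path, so they must be run from the repository root.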

## Section 3: Load Configuration
Load the configuration file (ciap_config.yaml) using PyYAML and parse the training parameters.

In [1]:
import yaml

def load_config(config_path):
    """Read a YAML configuration file and return its parsed contents.

    Args:
        config_path: Path to the YAML file to read.

    Returns:
        The parsed YAML document (a dict for a mapping-style config). An
        empty file yields an empty dict rather than ``None`` so callers can
        use ``.get`` on the result.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file does not contain valid YAML.
    """
    # Explicit UTF-8 so parsing does not depend on the platform's default encoding
    with open(config_path, "r", encoding="utf-8") as f:
        parsed = yaml.safe_load(f)
    # safe_load returns None for an empty document
    return {} if parsed is None else parsed

# Load the configuration file
# The path is relative, so it is resolved against the kernel's current working directory.
config_path = "audioldm/ciap/configs/ciap_config.yaml"
# `config` is the parsed YAML; the later cells read their settings from it.
config = load_config(config_path)

# Display the configuration
# Printing the parsed values makes the exact keys and types visible in the cell output.
print(config)

{'model': {'type': 'ContrastiveModel', 'image_encoder': {'type': 'ImageEncoder', 'input_size': [3, 224, 224], 'output_size': 512, 'pretrained': True}, 'audio_encoder': {'type': 'AudioEncoder', 'input_size': [1, 16000], 'output_size': 512, 'pretrained': True}}, 'training': {'batch_size': 32, 'learning_rate': 0.001, 'num_epochs': 50, 'weight_decay': '1e-5', 'scheduler': {'type': 'StepLR', 'step_size': 10, 'gamma': 0.1}}, 'dataset': {'train': {'path': 'data/train', 'image_extension': '.jpg', 'audio_extension': '.wav'}, 'val': {'path': 'data/val', 'image_extension': '.jpg', 'audio_extension': '.wav'}}, 'logging': {'log_dir': 'logs', 'log_interval': 10}, 'device': {'type': 'cuda'}}


## Section 4: Define Dataset and DataLoader
Use the `PairedImageAudioDataset` class to define the dataset and create a DataLoader for training.

In [2]:
from torch.utils.data import DataLoader
from audioldm.ciap.datasets.paired_image_audio_dataset import PairedImageAudioDataset

# Extract dataset parameters from the configuration
dataset_cfg = config.get("dataset") or {}

# The configuration nests per-split settings (dataset -> train / val -> path,
# image_extension, audio_extension), so the training values live under "train".
train_cfg = dataset_cfg.get("train") or {}

# Prefer the nested training-split keys; fall back to the flat keys that were
# read previously, and finally to the built-in defaults.
dataset_path = train_cfg.get("path", dataset_cfg.get("dataset_path", "./data/train"))
image_ext = train_cfg.get("image_extension", dataset_cfg.get("image_extension", ".jpg"))
audio_ext = train_cfg.get("audio_extension", dataset_cfg.get("audio_extension", ".wav"))

# Optional (Colab): to train on data stored in Google Drive, uncomment the lines
# below. They must stay after the config lookup above so the override is kept.
# **Please verify the path and ensure it contains image and audio files.**
# from google.colab import drive
# drive.mount('/content/drive')
# dataset_path = "/content/drive/My Drive/data/train"

# Define the dataset and DataLoader
dataset = PairedImageAudioDataset(dataset_path, image_ext=image_ext, audio_ext=audio_ext)
batch_size = config["training"].get("batch_size", 32)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True)

## Section 5: Initialize Models
Initialize the `ImageEncoder` and `AudioEncoder` models using the configuration parameters.

In [3]:
import torch
from audioldm.ciap.models.image_encoder import ImageEncoder
from audioldm.ciap.models.audio_encoder import AudioEncoder

# Initialize models
# A missing or null `device` section is treated as a request for the CPU.
device_config = config.get("device") or {"type": "cpu"}
requested_device = str(device_config.get("type", "cpu"))

# Honour the configured device, but fall back to the CPU when CUDA is requested
# and no GPU is available, so the notebook still runs on CPU-only runtimes.
if requested_device.startswith("cuda") and not torch.cuda.is_available():
    print(f"Requested device '{requested_device}' is not available; falling back to CPU.")
    requested_device = "cpu"
device = torch.device(requested_device)

# Each encoder is built from its own sub-section of the model configuration
image_encoder = ImageEncoder(config["model"]["image_encoder"]).to(device)
audio_encoder = AudioEncoder(config["model"]["audio_encoder"]).to(device)

# Set models to training mode
image_encoder.train()
# Left as the cell's last expression, so the audio encoder's structure is displayed
audio_encoder.train()

AudioEncoder(
  (fc1): Linear(in_features=16000, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=True)
  (relu): ReLU()
)

## Section 6: Define Loss Function and Optimizer
Set up the `ContrastiveLoss` function and Adam optimizer for training.

In [4]:
import torch.optim as optim
from audioldm.ciap.losses.contrastive_loss import ContrastiveLoss

# Define loss function and optimizer
criterion = ContrastiveLoss()

training_cfg = config["training"]

# float(): PyYAML loads exponent-only literals such as 1e-5 as strings (the
# parsed config shows weight_decay as '1e-5'), while Adam needs numeric values.
lr = float(training_cfg.get("learning_rate", 1e-3))
# Apply the configured weight decay; 0.0 matches Adam's default when the key is absent.
weight_decay = float(training_cfg.get("weight_decay", 0.0))

# A single optimizer updates the parameters of both encoders together
trainable_params = list(image_encoder.parameters()) + list(audio_encoder.parameters())
optimizer = optim.Adam(trainable_params, lr=lr, weight_decay=weight_decay)

## Section 7: Training Loop
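Run the training loop, using tqdm to track progress. For each batch, both encoders embed their inputs, the contrastive loss is computed and backpropagated, and the batch loss is reported alongside the running average for the epoch.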

In [None]:
from tqdm import tqdm

# Training parameters
num_epochs = config["training"].get("num_epochs", 10)

# Fail early with a clear message: with no batches the loop body never runs and
# the per-epoch average below would divide by zero.
num_batches = len(dataloader)
if num_batches == 0:
    raise ValueError(
        "The training DataLoader is empty; check the dataset path and file extensions."
    )

# Use encode() if available; fallback to forward().
# Resolved once here because the choice does not change between batches.
encode_images = image_encoder.encode if hasattr(image_encoder, "encode") else image_encoder
encode_audios = audio_encoder.encode if hasattr(audio_encoder, "encode") else audio_encoder

# Training loop
for epoch in range(1, num_epochs + 1):
    # Re-assert training mode so re-running this cell after an evaluation step still trains
    image_encoder.train()
    audio_encoder.train()

    running_loss = 0.0
    loop = tqdm(enumerate(dataloader, start=1), total=num_batches,
                desc=f"Epoch {epoch}/{num_epochs}", leave=True)
    for batch_idx, (images, audios) in loop:
        images = images.to(device)
        audios = audios.to(device)

        optimizer.zero_grad()

        img_emb = encode_images(images)
        aud_emb = encode_audios(audios)

        # No explicit labels are passed: the loss receives only the two embedding
        # batches. Presumably the i-th image is paired with the i-th audio --
        # TODO confirm against ContrastiveLoss.
        loss = criterion(img_emb, aud_emb)
        loss.backward()
        optimizer.step()

        # Read the scalar once; it is used for both the running total and the display
        batch_loss = loss.item()
        running_loss += batch_loss
        avg_loss = running_loss / batch_idx  # batch_idx starts at 1
        loop.set_postfix({"avg_loss": f"{avg_loss:.4f}", "batch_loss": f"{batch_loss:.4f}"})

    print(f"Epoch {epoch} finished. EpochLoss: {running_loss/num_batches:.4f}")

Epoch 1/50:  21%|██▏       | 27/127 [00:36<02:11,  1.32s/it, avg_loss=3.6003, batch_loss=3.4638]

## Section 8: Save Trained Weights
Save the trained weights of the `ImageEncoder` and `AudioEncoder` models to the Colab environment.

In [None]:
import os

# Directory that receives the trained encoder weights (created if missing)
output_dir = "ciap_trained_weights"
os.makedirs(output_dir, exist_ok=True)

# One checkpoint file per encoder, both inside the output directory
img_path, aud_path = (
    os.path.join(output_dir, filename)
    for filename in ("ciap_image_encoder.pt", "ciap_audio_encoder.pt")
)

# Write each encoder's state_dict to its file, image encoder first
for encoder, weights_path in ((image_encoder, img_path), (audio_encoder, aud_path)):
    torch.save(encoder.state_dict(), weights_path)

# Report where the files were written
print(f"Saved image encoder -> {img_path}")
print(f"Saved audio encoder  -> {aud_path}")