# Training CIAP Contrastive Model on Google Colab
This notebook demonstrates how to train the CIAP contrastive model on Google Colab. The trained weights for the ImageEncoder and AudioEncoder models will be saved for later use.

## Section 1: Install Required Libraries
Install necessary libraries such as PyTorch, PyYAML, and tqdm using pip.

In [None]:
# Install required libraries
!pip install torch torchvision torchaudio
!pip install pyyaml tqdm
!pip install progressbar

## Section 2: Clone the Repository and Set Up Environment
Clone the IE-643 repository and change into its `AudioLDM` directory.

In [None]:
# Remove existing repository if it exists and clone again
!rm -rf IE-643
!git clone https://github.com/luffy-taro-106/IE-643.git
%cd IE-643/AudioLDM

Cloning into 'IE-643'...
remote: Enumerating objects: 148, done.[K
remote: Counting objects: 100% (148/148), done.[K
remote: Compressing objects: 100% (112/112), done.[K
remote: Total 148 (delta 29), reused 147 (delta 28), pack-reused 0 (from 0)[K
Receiving objects: 100% (148/148), 1.47 MiB | 29.44 MiB/s, done.
Resolving deltas: 100% (29/29), done.
/content/IE-643/AudioLDM


## Section 3: Load Configuration
Load the configuration file (ciap_config.yaml) using PyYAML and parse the training parameters.

In [None]:
import yaml

def load_config(config_path):
    """Read a YAML configuration file and return its parsed contents.

    Args:
        config_path: Path to the YAML file.

    Returns:
        The parsed document. An empty file yields an empty dict instead of
        ``None`` so that callers can keep using ``.get(...)`` on the result.
    """
    # Explicit encoding so parsing does not depend on the platform default.
    with open(config_path, "r", encoding="utf-8") as f:
        parsed = yaml.safe_load(f)
    return {} if parsed is None else parsed

# Load the configuration file (path is relative to the AudioLDM directory
# entered by the clone cell).
config_path = "audioldm/ciap/configs/ciap_config.yaml"
config = load_config(config_path)

# Display the configuration
print(config)

{'model': {'type': 'ContrastiveModel', 'image_encoder': {'type': 'ImageEncoder', 'input_size': [3, 224, 224], 'output_size': 512, 'pretrained': True}, 'audio_encoder': {'type': 'AudioEncoder', 'input_size': [1, 16000], 'output_size': 512, 'pretrained': True}}, 'training': {'batch_size': 32, 'learning_rate': 0.001, 'num_epochs': 50, 'weight_decay': '1e-5', 'scheduler': {'type': 'StepLR', 'step_size': 10, 'gamma': 0.1}}, 'dataset': {'train': {'path': 'data/train', 'image_extension': '.jpg', 'audio_extension': '.wav'}, 'val': {'path': 'data/val', 'image_extension': '.jpg', 'audio_extension': '.wav'}}, 'logging': {'log_dir': 'logs', 'log_interval': 10}, 'device': {'type': 'cuda'}}


## Section 4: Define Dataset and DataLoader
Use the `PairedImageAudioDataset` class to define the dataset and create a DataLoader for training.

In [None]:
from torch.utils.data import DataLoader
from audioldm.ciap.datasets.paired_image_audio_dataset import PairedImageAudioDataset
import os

# Extract dataset parameters from the configuration
dataset_cfg = config.get("dataset") or {}
# The file extensions are nested under the "train" split in ciap_config.yaml;
# a top-level key is still honoured as a fallback.
train_split_cfg = dataset_cfg.get("train") or {}

from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Set dataset path - **Please verify this path and ensure it contains image and audio files**
dataset_path = "/content/drive/My Drive/data/train"

image_ext = train_split_cfg.get("image_extension", dataset_cfg.get("image_extension", ".jpg"))
audio_ext = train_split_cfg.get("audio_extension", dataset_cfg.get("audio_extension", ".wav"))

# Define the dataset and DataLoader
dataset = PairedImageAudioDataset(dataset_path, image_ext=image_ext, audio_ext=audio_ext)
batch_size = config["training"].get("batch_size", 32)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True)
print(f"Dataloader created with {len(dataloader)} batches.")

Mounted at /content/drive
Dataloader created with 127 batches.


## Section 5: Initialize Models and Train
Initialize the `CIAP_CLAP_Model` (CLAP audio model plus ResNet image encoder) using the configuration parameters, then train the image encoder with a contrastive loss.

In [None]:
# Model initialization and training loop using the CLAP audio model (HTSAT/PANN) and the ResNet image encoder
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# Use the CIAP CLAP-style model (wraps clap audio model + your ResNet image encoder)
from audioldm.ciap.models.ciap_clap_model import CIAP_CLAP_Model
from audioldm.ciap.datasets.paired_image_audio_dataset import PairedImageAudioDataset

# Simple contrastive loss (InfoNCE-style) — reuse your notebook's implementation if present
import torch.nn.functional as F
class ContrastiveLoss(nn.Module):
    """Symmetric cross-entropy loss over an image/audio similarity matrix.

    Row ``i`` of the image batch is treated as the positive match for row ``i``
    of the audio batch; every other row in the batch acts as a negative.

    Args:
        temperature: Divisor applied to the cosine similarities before the
            softmax inside the cross-entropy.
    """

    def __init__(self, temperature=0.07):
        super().__init__()
        self.temperature = temperature

    def forward(self, image_embeddings, audio_embeddings):
        """Return the mean of the image->audio and audio->image losses."""
        # Unit-length rows turn the dot product below into cosine similarity.
        img_unit = F.normalize(image_embeddings, p=2, dim=-1)
        aud_unit = F.normalize(audio_embeddings, p=2, dim=-1)

        similarity = (img_unit @ aud_unit.T) / self.temperature
        # The matching pair for row i sits on the diagonal.
        targets = torch.arange(img_unit.size(0), device=img_unit.device)

        image_to_audio = F.cross_entropy(similarity, targets)
        audio_to_image = F.cross_entropy(similarity.T, targets)
        return (image_to_audio + audio_to_image) / 2

# Load config values if available
cfg = config if "config" in globals() else {}
dataset_cfg = cfg.get("dataset") or {}
train_split_cfg = dataset_cfg.get("train") or {}
training_cfg = cfg.get("training") or {}
scheduler_cfg = training_cfg.get("scheduler") or {}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Dataset
# Prefer a `dataset_path` already set earlier in the notebook (the Google Drive
# location from Section 4); the relative path from the config is the fallback.
dataset_path = globals().get("dataset_path") or train_split_cfg.get("path", "./data/train")
# Extensions are nested under the "train" split in ciap_config.yaml; a
# top-level key is still honoured as a fallback.
image_ext = train_split_cfg.get("image_extension", dataset_cfg.get("image_extension", ".jpg"))
audio_ext = train_split_cfg.get("audio_extension", dataset_cfg.get("audio_extension", ".wav"))
dataset = PairedImageAudioDataset(dataset_path, image_ext=image_ext, audio_ext=audio_ext)
batch_size = int(training_cfg.get("batch_size", 16))
# Do not request more worker processes than the runtime has CPUs.
num_workers = min(4, os.cpu_count() or 1)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
print("Dataset samples:", len(dataset), "Batches:", len(dataloader))

# Model: use CLAP audio model (HTSAT-tiny by default) and ResNet image encoder
amodel = training_cfg.get("amodel", "HTSAT-tiny")
embed_dim = int(training_cfg.get("embed_dim", 512))
pretrained_audio_ckpt = training_cfg.get("pretrained_audio_ckpt", "")

model = CIAP_CLAP_Model(amodel=amodel, tmodel="roberta", pretrained_path=pretrained_audio_ckpt, image_proj_dim=embed_dim, device=device)
model.to(device)
model.train()  # image encoder will be trained

# Optimizer: train image encoder params (and projection head if you add one)
trainable_params = list(model.image_encoder.parameters())
learning_rate = float(training_cfg.get("learning_rate", 1e-4))
# float(): PyYAML loads a value written as `1e-5` as a string, not a number.
weight_decay = float(training_cfg.get("weight_decay", 1e-6))
optimizer = optim.Adam(trainable_params, lr=learning_rate, weight_decay=weight_decay)

# Scheduler settings: flat `lr_step` / `lr_gamma` keys take precedence; otherwise
# use the nested `training.scheduler` section from ciap_config.yaml.
lr_step = int(training_cfg.get("lr_step", scheduler_cfg.get("step_size", 10)))
lr_gamma = float(training_cfg.get("lr_gamma", scheduler_cfg.get("gamma", 0.5)))
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=lr_step, gamma=lr_gamma)

criterion = ContrastiveLoss(temperature=float(training_cfg.get("temperature", 0.07)))

# Training loop settings
# `epochs` takes precedence; `num_epochs` is the key used in ciap_config.yaml.
epochs = int(training_cfg.get("epochs", training_cfg.get("num_epochs", 10)))
save_dir = training_cfg.get("save_dir", "./ckpt")
os.makedirs(save_dir, exist_ok=True)

# Train for `epochs` passes over `dataloader`, saving the image encoder after each one.
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    model.train()
    loop = tqdm(enumerate(dataloader, start=1), total=len(dataloader), desc=f"Epoch {epoch}/{epochs}", leave=True)
    for batch_idx, (images, audios, *rest) in loop:
        images = images.to(device)
        audios = audios.to(device)

        optimizer.zero_grad()

        # The optimizer only holds image-encoder parameters, so the audio branch
        # is never updated; skip building its autograd graph.
        with torch.no_grad():
            # prepare audio dicts expected by CLAP audio model
            audio_dicts = model.preprocess_audio_waveform(audios)
            aud_emb = model.get_audio_embedding(audio_dicts)  # [B, D]
        img_emb = model.get_image_embedding(images)           # [B, D]

        # ContrastiveLoss L2-normalizes both inputs itself, so the embeddings
        # are passed through without a separate normalization step.
        loss = criterion(img_emb, aud_emb)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        avg_loss = running_loss / batch_idx
        loop.set_postfix({"batch_loss": f"{batch_loss:.4f}", "avg_loss": f"{avg_loss:.4f}"})

    scheduler.step()
    # max(..., 1) avoids a ZeroDivisionError when the dataloader is empty.
    print(f"Epoch {epoch} finished. Avg loss: {running_loss / max(len(dataloader), 1):.4f}")

    # save image encoder checkpoint each epoch
    img_ckpt = os.path.join(save_dir, f"ciap_image_encoder_epoch{epoch}.pt")
    torch.save(model.image_encoder.state_dict(), img_ckpt)
    print("Saved image encoder to", img_ckpt)

print("Training complete.")

## Section 6: Save Trained Weights
Save the trained weights of the `ImageEncoder` and `AudioEncoder` models to the Colab environment.

In [None]:
import os

# Save trained weights
output_dir = "ciap_trained_weights"
os.makedirs(output_dir, exist_ok=True)

img_path = os.path.join(output_dir, "ciap_image_encoder.pt")
aud_path = os.path.join(output_dir, "ciap_audio_encoder.pt")

# Standalone `image_encoder` / `audio_encoder` variables are never defined in
# this notebook; the trained image encoder lives on `model`.
torch.save(model.image_encoder.state_dict(), img_path)

# NOTE(review): the attribute name of the audio branch on CIAP_CLAP_Model is not
# visible in this notebook, so every entry outside `image_encoder.` is saved.
# Keys keep their prefixes from `model.state_dict()`, and the file may include
# more than the audio encoder (e.g. a text branch) — confirm against the loader.
image_prefix = "image_encoder."
audio_state = {
    key: value
    for key, value in model.state_dict().items()
    if not key.startswith(image_prefix)
}
torch.save(audio_state, aud_path)

print(f"Saved image encoder -> {img_path}")
print(f"Saved audio encoder  -> {aud_path}")

In [None]:
import shutil
import os

# Where the weights were written locally, and where they should land on Drive
source_dir = "ciap_trained_weights"
destination_dir = "/content/drive/My Drive/ciap_trained_weights2" # Replace with your desired path in Google Drive

# Make sure the Drive target folder exists before copying into it
os.makedirs(destination_dir, exist_ok=True)

# Mirror the local weights folder into Drive; report any failure instead of raising
try:
    shutil.copytree(source_dir, destination_dir, dirs_exist_ok=True)
except Exception as copy_error:
    print(f"Error copying files: {copy_error}")
else:
    print(f"Successfully copied trained weights from {source_dir} to {destination_dir}")