# IndicCLIP Model

> Defines the main IndicCLIP model architecture, combining the vision and text encoders.

In [None]:
#| default_exp model.clip

## Colab Setup

In [None]:
#| hide
# Mount Google Drive (Optional, but recommended for persistent storage)
from pathlib import Path

# Best-effort Google Drive mount. Every failure is reported with a message
# instead of being raised, so the rest of the notebook can still run.
try:
    # This import is expected to succeed only inside a Colab runtime.
    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except ModuleNotFoundError:
    # google.colab could not be imported: treat it as "not on Colab".
    print("Not running in Colab, skipping Drive mount.")
except Exception as e:
    # Any other error (e.g. raised by drive.mount) is printed, not re-raised.
    print(f"Error mounting Google Drive: {e}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.


In [None]:
#| export
# Make sure the `indic_clip` package is importable. If the first import fails
# and we are running on Colab, prepend the project location to sys.path and
# retry once. Failures are reported with messages rather than raised.
try:
    import indic_clip.core
    print("Reloaded indic_clip.core")
except ModuleNotFoundError:
    print("indic_clip.core not found initially.")
    # Imported locally because this cell is exported: the module generated from
    # it must not depend on names that are only imported in non-exported cells.
    import sys
    from pathlib import Path

    # Attempt to set sys.path if running in Colab and project cloned
    if 'google.colab' in sys.modules:
        # NOTE(review): the fallback puts '/content' itself on sys.path, which
        # only works if the package directory is /content/indic_clip, whereas
        # the error message below mentions /content/Indic-Clip/indic_clip.
        # Confirm the intended clone location.
        project_parent = '/content'
        drive_project = '/content/drive/MyDrive/Indic-Clip'
        if Path(drive_project).exists():
            # Prefer the copy stored on Google Drive when it is present.
            project_parent = drive_project
        if project_parent not in sys.path:
            sys.path.insert(0, project_parent)
            print(f"Added {project_parent} to sys.path")
        try:
            import indic_clip.core
            print("Imported indic_clip.core after path adjustment.")
        except ModuleNotFoundError:
            # Deliberately not re-raised: later cells report their own failures.
            print("ERROR: Still cannot find indic_clip.core. Ensure project structure is correct.")
            print("Expected: /content/Indic-Clip/indic_clip/core.py or similar in Drive")

indic_clip.core not found initially.
Added /content/drive/MyDrive/Indic-Clip to sys.path
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive detected, setting PROJECT_ROOT to /content/drive/MyDrive/Indic-Clip
Ensure your project files are located there.
Imported indic_clip.core after path adjustment.


In [None]:
#| hide
# Switch the working directory to the project folder on the mounted Drive
# (hard-coded Colab path; this cell is not meant to run elsewhere).
%cd /content/drive/MyDrive/Indic-Clip/

/content/drive/MyDrive/Indic-Clip


In [None]:
#| hide
!pip install -qr requirements.txt

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.7/296.7 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.8/297.8 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.9/46.9 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.2/322.2 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import logging

from fastai.vision.all import *

try:
    # Import necessary components from our project
    from indic_clip.core import get_logger, setup_logging, DEFAULT_EMBED_DIM, PRETRAINED_TOKENIZER_NAME
    from indic_clip.model.vision import VisionEncoder
    from indic_clip.model.text import TextEncoder
    from indic_clip.data.tokenization import IndicBERTTokenizer # Needed if passing tokenizer
except ModuleNotFoundError:
    print('MODULE NOT FOUND')
    # Fallback if core not found (e.g. testing): provide minimal stand-ins with
    # the same names so the rest of this module can still be defined and run.
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    def get_logger(name):
        """Stand-in for indic_clip.core.get_logger: return a stdlib logger."""
        return logging.getLogger(name)

    def setup_logging():
        """Stand-in for indic_clip.core.setup_logging: does nothing."""
        pass

    DEFAULT_EMBED_DIM = 768
    PRETRAINED_TOKENIZER_NAME = "ai4bharat/indic-bert"

    # Dummy classes if modules aren't available
    class VisionEncoder(nn.Module):
        """Dummy vision encoder that emits random features of width 768."""
        def __init__(self, model_name='dummy_vision', pretrained=True, output_dim=None):
            super().__init__()
            self.feature_dim = 768
            self.backbone = nn.Identity()
            self.projection = nn.Linear(768, output_dim) if output_dim else nn.Identity()

        def forward(self, x):
            # Create the random features on the input's device so the dummy
            # keeps working when the model has been moved off the CPU.
            features = torch.randn(x.shape[0], self.feature_dim, device=x.device)
            return self.projection(features)

        def set_gradient_checkpointing(self, enable):
            pass

    class TextEncoder(nn.Module):
        """Dummy text encoder that emits random features of width 768."""
        def __init__(self, model_name='dummy_text', pretrained=True, output_dim=None, tokenizer=None):
            super().__init__()
            self.feature_dim = 768
            self.backbone = nn.Identity()
            self.projection = nn.Linear(768, output_dim) if output_dim else nn.Identity()

        def forward(self, input_ids, attention_mask):
            # Same device handling as the dummy vision encoder.
            features = torch.randn(input_ids.shape[0], self.feature_dim, device=input_ids.device)
            return self.projection(features)

        def set_gradient_checkpointing(self, enable):
            pass

    class IndicBERTTokenizer:
        """Dummy tokenizer producing random ids of fixed length 10."""
        def __init__(self, *args, **kwargs):
            self.vocab_size = 30000
            # The example cell in this notebook reads `pad_token_id` when it
            # pads sequences, so the stand-in has to expose it as well.
            self.pad_token_id = 0

        def tokenize(self, texts):
            if isinstance(texts, str):
                texts = [texts]
            ids = torch.randint(0, self.vocab_size, (len(texts), 10))
            mask = torch.ones_like(ids)
            return {'input_ids': ids, 'attention_mask': mask}

        @classmethod
        def load_tokenizer(cls, *args, **kwargs):
            return cls()


setup_logging()
logger = get_logger(__name__)

## IndicCLIP Model Definition

In [None]:
#| export
class IndicCLIP(Module):
    """The main IndicCLIP model, combining Vision and Text Encoders.

    Image and text inputs are encoded by their respective backbones, projected
    by bias-free linear heads into a shared space of size `embed_dim`, and
    L2-normalized so that they can be compared for contrastive learning.
    """

    # Temperature used to initialise `logit_scale` as log(1 / T).
    INIT_TEMPERATURE = 0.07
    # Smallest temperature allowed; `logit_scale` is clamped to log(1 / T_min).
    MIN_TEMPERATURE = 0.01

    def __init__(self,
                 embed_dim: int = DEFAULT_EMBED_DIM,
                 vision_model_name: str = 'vit_base_patch16_224',
                 vision_pretrained: bool = True,
                 text_model_name: str = PRETRAINED_TOKENIZER_NAME, # Use tokenizer's base model
                 text_pretrained: bool = True,
                 tokenizer: IndicBERTTokenizer = None):
        """
        Initializes the IndicCLIP model.

        Args:
            embed_dim (int): The dimension of the shared embedding space.
            vision_model_name (str): Name of the timm vision model.
            vision_pretrained (bool): Whether to load pretrained weights for the vision model.
            text_model_name (str): Name or path of the Hugging Face text model.
            text_pretrained (bool): Whether to load pretrained weights for the text model.
            tokenizer (IndicBERTTokenizer): The tokenizer instance, needed for text encoder setup (embedding resize).

        Raises:
            ValueError: If either encoder reports `feature_dim` as None.
        """
        # No explicit super().__init__() here: this class derives from fastai's
        # `Module`, which is documented as not requiring subclasses to call it.
        if tokenizer is None:
            # Only a warning: no default tokenizer is loaded on the caller's behalf.
            logger.warning("No tokenizer provided to IndicCLIP. Text encoder might not resize embeddings correctly.")

        # Both encoders are built without their own projection (output_dim=None);
        # the projection heads below map their features to `embed_dim`.
        self.vision_encoder = VisionEncoder(
            model_name=vision_model_name,
            pretrained=vision_pretrained,
            output_dim=None
        )
        self.text_encoder = TextEncoder(
            model_name=text_model_name,
            pretrained=text_pretrained,
            output_dim=None,
            tokenizer=tokenizer # Pass tokenizer for potential embedding resize
        )

        # --- Projection Heads ---
        vision_dim = self.vision_encoder.feature_dim
        text_dim = self.text_encoder.feature_dim
        if vision_dim is None or text_dim is None:
            raise ValueError("Could not determine feature dimensions for vision or text encoders.")

        # The linear layers keep PyTorch's default initialisation.
        self.visual_projection = nn.Linear(vision_dim, embed_dim, bias=False)
        self.text_projection = nn.Linear(text_dim, embed_dim, bias=False)

        # --- Logit Scale ---
        # Learnable scalar stored in log space: log(1 / 0.07), as in OpenAI CLIP.
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / self.INIT_TEMPERATURE))

        logger.info(f"IndicCLIP initialized with vision='{vision_model_name}', text='{text_model_name}', embed_dim={embed_dim}")

    def encode_image(self, image: torch.Tensor) -> torch.Tensor:
        """Encodes an image into the shared embedding space.

        Args:
            image (torch.Tensor): Input image tensor (B, C, H, W).

        Returns:
            torch.Tensor: Image features projected into the embedding space (B, embed_dim), L2-normalized.
        """
        image_features = self.vision_encoder(image)
        projected_features = self.visual_projection(image_features)
        # Unit-normalize along the embedding dimension.
        return F.normalize(projected_features, p=2, dim=-1)

    def encode_text(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Encodes text into the shared embedding space.

        Args:
            input_ids (torch.Tensor): Input token IDs (B, SeqLen).
            attention_mask (torch.Tensor): Attention mask (B, SeqLen).

        Returns:
            torch.Tensor: Text features projected into the embedding space (B, embed_dim), L2-normalized.
        """
        text_features = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        projected_features = self.text_projection(text_features)
        # Unit-normalize along the embedding dimension.
        return F.normalize(projected_features, p=2, dim=-1)

    def forward(self, image: torch.Tensor,
                text_input,
                attention_mask: torch.Tensor = None) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Forward pass for training. Encodes both image and text.

        Two call forms are supported:
            - `model(image, (input_ids, attention_mask))`: the text tensors
              arrive packed in one tuple.
            - `model(image, input_ids, attention_mask)`: the text tensors are
              passed as separate positional (or keyword) arguments.

        Args:
            image (torch.Tensor): Input image tensor (B, C, H, W).
            text_input (tuple | torch.Tensor): Either the pair
                `(input_ids, attention_mask)` or, when `attention_mask` is given
                separately, the `input_ids` tensor (B, SeqLen).
            attention_mask (torch.Tensor, optional): Attention mask (B, SeqLen).
                Leave as None when `text_input` already contains it.

        Returns:
            tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
                - image_features: Normalized image features (B, embed_dim).
                - text_features: Normalized text features (B, embed_dim).
                - logit_scale: The clamped logit scale, already exponentiated (scalar tensor).
        """
        if attention_mask is None:
            # Packed form: the second argument holds both text tensors.
            input_ids, attention_mask = text_input
        else:
            # Flat form: the second argument is the token ids themselves.
            input_ids = text_input

        image_features = self.encode_image(image)
        text_features = self.encode_text(input_ids, attention_mask)

        # Keep the temperature from dropping below MIN_TEMPERATURE by capping the
        # log-space parameter at log(1 / 0.01) ~= 4.605. The in-place clamp runs
        # under no_grad so it is not recorded by autograd.
        with torch.no_grad():
            self.logit_scale.clamp_(max=np.log(1 / self.MIN_TEMPERATURE))

        # The value returned is exp(logit_scale), i.e. the multiplier that is
        # applied to similarities, not the raw log-space parameter.
        return image_features, text_features, self.logit_scale.exp()

    def set_gradient_checkpointing(self, enable: bool = True):
        """Enable or disable gradient checkpointing for both encoders."""
        self.vision_encoder.set_gradient_checkpointing(enable)
        self.text_encoder.set_gradient_checkpointing(enable)
        logger.info(f"IndicCLIP gradient checkpointing {'enabled' if enable else 'disabled'} for both encoders.")

## Example Usage

In [None]:
#| eval: false
# Smoke test: build the model, run one forward pass on dummy data and check
# output shapes and L2 normalisation. Errors are printed with a traceback
# instead of propagating.
if __name__ == '__main__':
    print("--- IndicCLIP Example --- ")
    try:
        # Ensure tokenizer is available (needed by TextEncoder init)
        test_tokenizer = IndicBERTTokenizer.load_tokenizer()

        # 1. Instantiate the IndicCLIP model
        model = IndicCLIP(
            embed_dim=768,
            vision_model_name='vit_base_patch16_224', # Or 'resnet18' for faster test
            vision_pretrained=True,
            text_model_name=PRETRAINED_TOKENIZER_NAME,
            text_pretrained=True,
            tokenizer=test_tokenizer # Pass the loaded tokenizer
        )
        model.eval() # Set to evaluation mode for testing
        print("Model Instantiated.")

        # 2. Create dummy inputs
        batch_size = 4
        img_size = 224 # Must match vision model expectation
        seq_len = 32

        dummy_images = torch.randn(batch_size, 3, img_size, img_size)
        # Use the actual tokenizer to create realistic token IDs
        dummy_texts = ["यह एक उदाहरण वाक्य है।"] * batch_size
        tokenized = test_tokenizer.tokenize(dummy_texts)
        # Truncate to seq_len; shorter outputs are right-padded below.
        dummy_input_ids = tokenized['input_ids'][:, :seq_len]
        dummy_attn_mask = tokenized['attention_mask'][:, :seq_len]
        if dummy_input_ids.shape[1] < seq_len:
            pad_len = seq_len - dummy_input_ids.shape[1]
            # Fall back to id 0 when the tokenizer has no pad_token_id
            # attribute or it is unset.
            pad_id = getattr(test_tokenizer, 'pad_token_id', None) or 0
            pad_tensor_ids = torch.full((batch_size, pad_len), pad_id, dtype=torch.long)
            pad_tensor_mask = torch.zeros((batch_size, pad_len), dtype=torch.long)
            dummy_input_ids = torch.cat([dummy_input_ids, pad_tensor_ids], dim=1)
            dummy_attn_mask = torch.cat([dummy_attn_mask, pad_tensor_mask], dim=1)

        print(f"\nDummy Image Input Shape: {dummy_images.shape}")
        print(f"Dummy Text Input IDs Shape: {dummy_input_ids.shape}")
        print(f"Dummy Text Mask Shape: {dummy_attn_mask.shape}")

        # 3. Test forward pass
        print("\nTesting forward pass...")
        with torch.no_grad():
            # forward() expects the text tensors packed into a single tuple as
            # its second argument.
            img_feat, txt_feat, logit_val = model(dummy_images, (dummy_input_ids, dummy_attn_mask))

        print(f"Forward pass output type: {type((img_feat, txt_feat, logit_val))}")
        print(f"Image Features Shape: {img_feat.shape}")
        print(f"Text Features Shape: {txt_feat.shape}")
        print(f"Logit Scale Value: {logit_val.item():.4f}...")

        assert img_feat.shape == (batch_size, model.visual_projection.out_features)
        assert txt_feat.shape == (batch_size, model.text_projection.out_features)
        assert img_feat.shape == txt_feat.shape
        assert logit_val.ndim == 0 # Scalar tensor

        # 4. Test individual encoders
        print("\nTesting encode_image...")
        with torch.no_grad():
            encoded_img = model.encode_image(dummy_images)
        print(f"Encoded Image Features Shape: {encoded_img.shape}")
        print(f"Encoded Image Features Norm (sum of squares): {encoded_img.norm(dim=-1).pow(2)}")
        assert encoded_img.shape == img_feat.shape
        # Check normalization (norm should be close to 1)
        assert torch.allclose(encoded_img.norm(dim=-1), torch.ones(batch_size), atol=1e-6)

        print("\nTesting encode_text...")
        with torch.no_grad():
            encoded_txt = model.encode_text(dummy_input_ids, dummy_attn_mask)
        print(f"Encoded Text Features Shape: {encoded_txt.shape}")
        print(f"Encoded Text Features Norm (sum of squares): {encoded_txt.norm(dim=-1).pow(2)}")
        assert encoded_txt.shape == txt_feat.shape
        # Check normalization
        assert torch.allclose(encoded_txt.norm(dim=-1), torch.ones(batch_size), atol=1e-6)

        print("\nIndicCLIP test completed successfully.")

    except Exception as e:
        print(f"An error occurred during IndicCLIP example: {e}")
        import traceback
        traceback.print_exc()

--- IndicCLIP Example --- 


2025-04-18 10:18:52 - indic_clip.data.tokenization - INFO - Successfully loaded tokenizer: /content/drive/MyDrive/Indic-Clip/models/tokenizer
2025-04-18 10:18:52 - indic_clip.data.tokenization - INFO - Custom special tokens already exist or none were specified.
2025-04-18 10:18:52 - indic_clip.data.tokenization - INFO - Tokenizer state loaded successfully from /content/drive/MyDrive/Indic-Clip/models/tokenizer
2025-04-18 10:18:55 - timm.models._builder - INFO - Loading pretrained weights from Hugging Face hub (timm/vit_base_patch16_224.augreg2_in21k_ft_in1k)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

2025-04-18 10:18:58 - timm.models._hub - INFO - [timm/vit_base_patch16_224.augreg2_in21k_ft_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.
2025-04-18 10:18:58 - timm.models._builder - INFO - Missing keys (fc_norm.weight, fc_norm.bias) discovered while loading pretrained weights. This is expected if model is being adapted.
2025-04-18 10:18:58 - indic_clip.model.vision - INFO - Loaded timm model: vit_base_patch16_224 with pretrained=True
2025-04-18 10:18:58 - indic_clip.model.vision - INFO - Backbone feature dimension: 768


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

2025-04-18 10:18:59 - indic_clip.model.text - INFO - Loading text model: ai4bharat/indic-bert with pretrained=True
2025-04-18 10:18:59 - indic_clip.model.text - INFO - Model hidden dimension: 768


pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

2025-04-18 10:19:11 - indic_clip.model.text - INFO - Model embedding size resized to 200002
2025-04-18 10:19:11 - __main__ - INFO - IndicCLIP initialized with vision='vit_base_patch16_224', text='ai4bharat/indic-bert', embed_dim=768


Model Instantiated.

Dummy Image Input Shape: torch.Size([4, 3, 224, 224])
Dummy Text Input IDs Shape: torch.Size([4, 32])
Dummy Text Mask Shape: torch.Size([4, 32])

Testing forward pass...


model.safetensors:   0%|          | 0.00/135M [00:00<?, ?B/s]

Forward pass output type: <class 'tuple'>
Image Features Shape: torch.Size([4, 768])
Text Features Shape: torch.Size([4, 768])
Logit Scale Value: 14.2857...

Testing encode_image...
Encoded Image Features Shape: torch.Size([4, 768])
Encoded Image Features Norm (sum of squares): tensor([1.0000, 1.0000, 1.0000, 1.0000])

Testing encode_text...
Encoded Text Features Shape: torch.Size([4, 768])
Encoded Text Features Norm (sum of squares): tensor([1.0000, 1.0000, 1.0000, 1.0000])

IndicCLIP test completed successfully.


In [None]:
#| hide
import nbdev
nbdev.nbdev_export() # Runs the nbdev export from inside the notebook (the terminal equivalent is the `nbdev_export` command)