# Core Utilities

> Basic utilities, constants, and shared functions for the Indic-CLIP project.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import os
from pathlib import Path
import logging

## Constants

In [None]:
#| export
# --- Project Root ---
# Taken from the INDIC_CLIP_ROOT environment variable when it is set, otherwise
# from the current working directory. A root whose last component is 'nbs' is
# replaced by its parent, so running from inside the notebooks folder still
# yields the directory above it.
PROJECT_ROOT = Path(os.environ.get("INDIC_CLIP_ROOT", Path.cwd()))
PROJECT_ROOT = PROJECT_ROOT.parent if PROJECT_ROOT.name == 'nbs' else PROJECT_ROOT

# --- Data Paths ---
# Everything data-related hangs off <PROJECT_ROOT>/data.
DATA_PATH = PROJECT_ROOT.joinpath('data')

# Raw inputs, one sub-directory per source under data/raw.
RAW_DATA_PATH = DATA_PATH.joinpath('raw')
HINDI_RAW_PATH = RAW_DATA_PATH.joinpath('hindi')
SANSKRIT_RAW_PATH = RAW_DATA_PATH.joinpath('sanskrit')
SYNTHETIC_RAW_PATH = RAW_DATA_PATH.joinpath('synthetic')

# Sibling directories of data/raw.
PROCESSED_DATA_PATH = DATA_PATH.joinpath('processed')
BENCHMARK_DATA_PATH = DATA_PATH.joinpath('benchmarks')

# --- Model Paths ---
# Model artifacts live under <PROJECT_ROOT>/models, split by artifact kind.
MODEL_PATH = PROJECT_ROOT.joinpath('models')
CHECKPOINT_PATH = MODEL_PATH.joinpath('checkpoints')
ONNX_PATH = MODEL_PATH.joinpath('onnx')
QUANTIZED_PATH = MODEL_PATH.joinpath('quantized')

# --- Tokenizer Paths/Files ---
# Tokenizer models/vocabs are kept alongside the processed data, so the
# tokenizer directory is the processed-data directory itself.
TOKENIZER_PATH = PROCESSED_DATA_PATH
TOKENIZER_MODEL_FILE = TOKENIZER_PATH.joinpath('indic_tokenizer.model')
TOKENIZER_VOCAB_FILE = TOKENIZER_PATH.joinpath('indic_tokenizer.vocab')

# --- Special Tokens ---
# Bracketed control tokens.
PAD_TOKEN = "[PAD]"
UNK_TOKEN = "[UNK]"
CLS_TOKEN = "[CLS]"
SEP_TOKEN = "[SEP]"
MASK_TOKEN = "[MASK]"

# Angle-bracketed language tags.
SANSKRIT_TOKEN = "<Sa>"
HINDI_TOKEN = "<Hi>"

# All special tokens collected in one list, in the order defined above.
SPECIAL_TOKENS = [
    PAD_TOKEN,
    UNK_TOKEN,
    CLS_TOKEN,
    SEP_TOKEN,
    MASK_TOKEN,
    SANSKRIT_TOKEN,
    HINDI_TOKEN,
]

# --- Default Values ---
DEFAULT_IMAGE_SIZE = 224
DEFAULT_BATCH_SIZE = 64
# Example value; adjust after tokenizer training.
DEFAULT_VOCAB_SIZE = 32_000
# Example value; adjust based on the chosen model architecture.
DEFAULT_EMBED_DIM = 512

## Utility Functions

In [None]:
#| export
def setup_logging(level=logging.INFO):
    """Apply the project's basic logging configuration.

    Delegates to ``logging.basicConfig`` with the requested ``level``, a
    ``time - logger name - level - message`` record layout, and a
    ``YYYY-MM-DD HH:MM:SS`` timestamp layout.

    Note: per the standard library documentation, ``basicConfig`` does nothing
    when the root logger already has handlers configured.
    """
    record_layout = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    timestamp_layout = '%Y-%m-%d %H:%M:%S'
    logging.basicConfig(level=level, format=record_layout, datefmt=timestamp_layout)

def get_logger(name):
    """Look up the logger registered under ``name`` via ``logging.getLogger``."""
    named_logger = logging.getLogger(name)
    return named_logger

In [None]:
#| export
# Example usage: when this module is executed directly, configure logging and
# report the resolved locations and special tokens. (Remove or comment out in
# the final version if it is not needed in the core export.)
if __name__ == '__main__':
    setup_logging()
    logger = get_logger(__name__)
    for message in (
        f"Project Root: {PROJECT_ROOT}",
        f"Data Path: {DATA_PATH}",
        f"Model Path: {MODEL_PATH}",
        f"Special Tokens: {SPECIAL_TOKENS}",
    ):
        logger.info(message)

In [None]:
#| hide
# Trigger export
import nbdev; 
# nbdev.nbdev_export() # Run this manually in terminal after editing