# Data Preprocessing

> Functions for cleaning and filtering raw image-text pair data. Includes image dimension/aspect ratio filtering, text length filtering, and duplicate removal using image hashing and text matching.

In [None]:
#| default_exp data.preprocessing

In [1]:
#| hide
from nbdev.showdoc import *

In [2]:
#| export
from pathlib import Path
# Imported unconditionally so `sys` is always in scope afterwards; the
# fallback further down this file reads `sys.modules` and would otherwise
# only find `sys` when the first import attempt below happened to fail.
import sys

# Make `indic_clip.core` importable. If the first attempt fails, put a likely
# project directory on sys.path and try once more.
try:
    import indic_clip.core
    print("Reloaded indic_clip.core")
except ModuleNotFoundError:
    print("indic_clip.core not found initially.")
    # Each runtime has a default directory and a preferred one that wins
    # when it exists on disk.
    if 'google.colab' in sys.modules:
        # NOTE(review): '/content' itself is added, which only helps if the
        # `indic_clip` package sits directly under /content -- confirm.
        project_parent = '/content'
        preferred_root = '/content/drive/MyDrive/Indic-Clip'
        expected_hint = "Expected: /content/Indic-Clip/indic_clip/core.py or similar in Drive"
    else:
        project_parent = '/workspace'
        preferred_root = '/workspace/indic-clip'
        # The package directory is `indic_clip` (underscore) and Drive does
        # not apply outside Colab, so the hint says exactly that.
        expected_hint = "Expected: /workspace/indic-clip/indic_clip/core.py"

    if Path(preferred_root).exists():
        project_parent = preferred_root
    if project_parent not in sys.path:
        sys.path.insert(0, project_parent)
        print(f"Added {project_parent} to sys.path")

    try:
        import indic_clip.core
        print("Imported indic_clip.core after path adjustment.")
    except ModuleNotFoundError:
        # Not re-raised: execution continues and only reports the problem.
        print("ERROR: Still cannot find indic_clip.core. Ensure project structure is correct.")
        print(expected_hint)

indic_clip.core not found initially.
Added /workspace/indic-clip to sys.path
Imported indic_clip.core after path adjustment.


In [3]:
#| export
import os
import json
import logging
from pathlib import Path
from PIL import Image, UnidentifiedImageError
import imagehash
import pandas as pd # Optional, but useful for handling dataframes
from tqdm.notebook import tqdm

# Pull shared paths and logging helpers from the project core module; when it
# cannot be found, define local stand-ins with the same names.
try:
    from indic_clip.core import (
        HINDI_RAW_PATH,
        SANSKRIT_RAW_PATH,
        SYNTHETIC_RAW_PATH,
        PROCESSED_DATA_PATH,
        get_logger,
        setup_logging,
        ensure_dir
    )
except ModuleNotFoundError:
    # Imported locally: the check below reads `sys.modules`, and nothing in
    # this cell's own import list brings `sys` into scope.
    import sys

    print("Error importing from indic_clip.core. Using Fallbacks.")
    # Fallbacks if core isn't importable (e.g., interactive testing)
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    if 'google.colab' in sys.modules:
        PROJECT_ROOT = Path('/content/Indic-Clip')
        if Path('/content/drive/MyDrive/Indic-Clip').exists():
            PROJECT_ROOT = Path('/content/drive/MyDrive/Indic-Clip')
    else:
        PROJECT_ROOT = Path('.').resolve()
        # When the working directory is the notebooks folder, step up one level.
        if PROJECT_ROOT.name == 'nbs':
            PROJECT_ROOT = PROJECT_ROOT.parent

    DATA_PATH = PROJECT_ROOT / 'data'
    RAW_DATA_PATH = DATA_PATH / 'raw'
    HINDI_RAW_PATH = RAW_DATA_PATH / 'hindi'
    SANSKRIT_RAW_PATH = RAW_DATA_PATH / 'sanskrit'
    SYNTHETIC_RAW_PATH = RAW_DATA_PATH / 'synthetic'
    PROCESSED_DATA_PATH = DATA_PATH / 'processed'

    def get_logger(name):
        """Return the standard-library logger registered under `name`."""
        return logging.getLogger(name)

    def setup_logging():
        """Do nothing; `logging.basicConfig` above already configured logging."""
        pass

    def ensure_dir(path: Path):
        """Create `path` and any missing parents; an existing directory is fine."""
        path.mkdir(parents=True, exist_ok=True)

# Setup logging
setup_logging()
logger = get_logger(__name__)

## Configuration

In [4]:
#| export
# --- Filtering Thresholds ---
# Bounds on image side length; width and height are each checked against them.
MIN_IMAGE_RESOLUTION = 224
MAX_IMAGE_RESOLUTION = 4096
# Bounds on width / height, i.e. from 1:3 up to 3:1.
MIN_ASPECT_RATIO = 1 / 3
MAX_ASPECT_RATIO = 3
# Bounds on caption length, counted in characters.
MIN_TEXT_LENGTH = 5
MAX_TEXT_LENGTH = 256

# --- Output ---
# Filtered records are saved as FILTERED_OUTPUT_PATH / FILTERED_DATA_FILENAME.
FILTERED_OUTPUT_PATH = PROCESSED_DATA_PATH
FILTERED_DATA_FILENAME = "filtered_data.jsonl"

## Helper Functions

In [5]:
#| export
def load_raw_data(jsonl_path: Path) -> list:
    """Loads raw data from a JSONL file.

    Blank lines are ignored silently. Lines that are not valid JSON are
    skipped with a warning.

    Args:
        jsonl_path: Path to the input JSONL file (e.g., flickr8k_hindi_raw.jsonl).
            A plain string path is accepted as well.

    Returns:
        A list with one parsed JSON value per valid line.
        Returns an empty list if the file is not found, is empty, or cannot
        be read.
    """
    jsonl_path = Path(jsonl_path)  # tolerate str paths; a Path passes through unchanged
    data = []
    if not jsonl_path.exists():
        logger.error(f"Raw data file not found: {jsonl_path}")
        return data

    try:
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    # An empty line (e.g. a trailing newline) is not a
                    # malformed record, so it should not raise a warning.
                    continue
                try:
                    data.append(json.loads(stripped))
                except json.JSONDecodeError:
                    logger.warning(f"Skipping invalid JSON line in {jsonl_path}: {stripped}")
        logger.info(f"Loaded {len(data)} items from {jsonl_path}")
        return data
    except Exception as e:
        # Deliberate best-effort: any read failure yields an empty result
        # rather than an exception, and partially read data is discarded.
        logger.error(f"Error loading raw data from {jsonl_path}: {e}")
        return []

In [6]:
#| export
def get_image_metadata(image_path: Path) -> dict | None:
    """Read an image's pixel dimensions and derive its aspect ratio.

    Args:
        image_path: Location of the image file to inspect.

    Returns:
        {'width': int, 'height': int, 'aspect_ratio': float}, where the
        aspect ratio is width divided by height. None is returned when the
        file is missing, cannot be identified as an image, reports a zero
        dimension, or fails for any other reason; each case is logged.
    """
    try:
        with Image.open(image_path) as handle:
            w, h = handle.size

        # Guard the division below against degenerate images.
        if not (w and h):
            logger.warning(f"Image has zero dimension: {image_path} (Size: {w}x{h})")
            return None

        return {"width": w, "height": h, "aspect_ratio": w / h}
    except FileNotFoundError:
        logger.warning(f"Image file not found: {image_path}")
    except UnidentifiedImageError:
        logger.warning(f"Could not identify image file (possibly corrupt): {image_path}")
    except Exception as e:
        logger.error(f"Error processing image {image_path}: {e}")
    return None

In [7]:
#| export
def calculate_image_hash(image_path: Path) -> str | None:
    """Compute the perceptual hash (phash) of an image file.

    Args:
        image_path: Location of the image file to hash.

    Returns:
        The string form of the image's phash, or None when the file is
        missing, is not an identifiable image, or hashing fails.
    """
    try:
        with Image.open(image_path) as handle:
            return str(imagehash.phash(handle))
    except (FileNotFoundError, UnidentifiedImageError):
        # Kept quiet on purpose: get_image_metadata logs these same two
        # conditions when it is called on the path.
        return None
    except Exception as e:
        logger.error(f"Error calculating hash for image {image_path}: {e}")
        return None

## Filtering Logic

In [8]:
#| export
def filter_data(raw_data: list, base_image_path: Path) -> list:
    """Applies filtering rules to the raw data.

    Each item is checked in this order and dropped at the first failing rule:
    - Entry shape: must be a dict with 'image_path_relative' and 'caption',
      and the path must be a string or path-like object
    - Image readability (metadata could be obtained)
    - Image resolution: width and height within
      MIN_IMAGE_RESOLUTION..MAX_IMAGE_RESOLUTION
    - Image aspect ratio within MIN_ASPECT_RATIO..MAX_ASPECT_RATIO
    - Text length: caption is a str of MIN_TEXT_LENGTH..MAX_TEXT_LENGTH
      characters (measured on the raw, unstripped caption)
    - Duplicate removal (based on image phash OR exact match of the
      whitespace-stripped caption); the first occurrence is kept

    How many items each rule removed is logged when filtering finishes.

    Args:
        raw_data: List of dictionaries loaded from raw JSONL.
                  Expected keys: 'image_path_relative', 'caption'.
        base_image_path: The base directory where images corresponding to
                         'image_path_relative' are stored (e.g., HINDI_RAW_PATH).
                         A plain string path is accepted as well.

    Returns:
        A list of the original item dictionaries that passed every filter,
        in input order.
    """
    base_image_path = Path(base_image_path)
    filtered_list = []
    seen_image_hashes = set()
    seen_captions = set()
    skipped_counts = {
        "invalid_entry": 0,
        "image_error": 0,
        "resolution": 0,
        "aspect_ratio": 0,
        "text_length": 0,
        "duplicate_image": 0,
        "duplicate_caption": 0,
    }

    logger.info(f"Starting filtering process for {len(raw_data)} raw items...")

    for item in tqdm(raw_data, desc="Filtering Data"):
        if not isinstance(item, dict) or 'image_path_relative' not in item or 'caption' not in item:
            skipped_counts["invalid_entry"] += 1
            continue

        relative_img_path = item['image_path_relative']
        caption = item['caption']

        # A null or numeric path would make the `/` join below raise
        # TypeError and abort the whole run; treat it as a malformed entry.
        if not isinstance(relative_img_path, (str, os.PathLike)):
            skipped_counts["invalid_entry"] += 1
            continue
        full_image_path = base_image_path / relative_img_path

        # 1. Filter by Image Metadata
        metadata = get_image_metadata(full_image_path)
        if metadata is None:
            skipped_counts["image_error"] += 1
            continue  # Skip if image can't be opened or has errors

        width, height = metadata['width'], metadata['height']
        if not (MIN_IMAGE_RESOLUTION <= width <= MAX_IMAGE_RESOLUTION and
                MIN_IMAGE_RESOLUTION <= height <= MAX_IMAGE_RESOLUTION):
            skipped_counts["resolution"] += 1
            continue

        if not (MIN_ASPECT_RATIO <= metadata['aspect_ratio'] <= MAX_ASPECT_RATIO):
            skipped_counts["aspect_ratio"] += 1
            continue

        # 2. Filter by Text Length
        # Non-string captions are counted under the same reason.
        if not isinstance(caption, str) or not (MIN_TEXT_LENGTH <= len(caption) <= MAX_TEXT_LENGTH):
            skipped_counts["text_length"] += 1
            continue

        # 3. Filter by Duplicates
        # If hashing fails (None), only the caption check applies.
        img_hash = calculate_image_hash(full_image_path)
        normalized_caption = caption.strip()  # Basic normalization

        # An item that is both an image and a caption duplicate is counted
        # once, as an image duplicate.
        if img_hash is not None and img_hash in seen_image_hashes:
            skipped_counts["duplicate_image"] += 1
            continue
        if normalized_caption in seen_captions:
            skipped_counts["duplicate_caption"] += 1
            continue

        # If all filters passed, add to list and update seen sets
        filtered_list.append(item)
        if img_hash is not None:
            seen_image_hashes.add(img_hash)
        seen_captions.add(normalized_caption)

    logger.info(f"Finished filtering. Kept {len(filtered_list)} items.")
    logger.info(f"Skipped counts: {skipped_counts}")
    return filtered_list

## Main Execution: Preprocessing

In [9]:
#| eval: false
if __name__ == '__main__':
    # Script entry point: load the raw Hindi records, filter them, and write
    # the survivors out as JSONL.
    logger.info("--- Running Data Preprocessing Script (Basic Filtering) ---")

    # --- Inputs ---
    # Hindi is the only source processed for now. Its images are resolved
    # relative to the same directory that holds the raw JSONL file.
    hindi_image_base = HINDI_RAW_PATH
    hindi_raw_jsonl = HINDI_RAW_PATH / 'flickr8k_hindi_raw.jsonl'
    # Other sources, not enabled yet:
    # sanskrit_raw_jsonl = SANSKRIT_RAW_PATH / 'sanskrit_raw.jsonl'
    # sanskrit_image_base = SANSKRIT_RAW_PATH # Adjust if images are in subdirs
    # synthetic_raw_jsonl = SYNTHETIC_RAW_PATH / 'synthetic_raw.jsonl'
    # synthetic_image_base = SYNTHETIC_RAW_PATH # Adjust if images are in subdirs

    # --- Load ---
    logger.info(f"Loading Hindi raw data from {hindi_raw_jsonl}...")
    raw_hindi_data = load_raw_data(hindi_raw_jsonl)

    # --- Filter ---
    all_filtered_data = []
    if not raw_hindi_data:
        logger.warning("No Hindi raw data loaded, skipping filtering.")
    else:
        logger.info("Filtering Hindi data...")
        filtered_hindi = filter_data(raw_hindi_data, hindi_image_base)
        all_filtered_data += filtered_hindi

    # TODO: Repeat the load/filter steps for Sanskrit and Synthetic data once
    # those sources become available, passing the matching base_image_path:
    # raw_sanskrit_data = load_raw_data(sanskrit_raw_jsonl)
    # if raw_sanskrit_data:
    #     logger.info("Filtering Sanskrit data...")
    #     filtered_sanskrit = filter_data(raw_sanskrit_data, sanskrit_image_base)
    #     all_filtered_data.extend(filtered_sanskrit) # Append keeping track of source if needed

    # --- Save ---
    output_filepath = FILTERED_OUTPUT_PATH / FILTERED_DATA_FILENAME
    logger.info(f"Saving {len(all_filtered_data)} filtered items to {output_filepath}...")
    ensure_dir(FILTERED_OUTPUT_PATH)
    try:
        with open(output_filepath, 'w', encoding='utf-8') as f:
            # One JSON document per line; ensure_ascii=False writes non-ASCII
            # characters as-is rather than as \u escapes.
            f.writelines(
                json.dumps(item, ensure_ascii=False) + '\n'
                for item in all_filtered_data
            )
        logger.info("Successfully saved filtered data.")
    except Exception as e:
        logger.error(f"Error saving filtered data to {output_filepath}: {e}")

    logger.info("--- Data Preprocessing Script (Basic Filtering) Finished ---")

2025-04-22 10:49:40 - __main__ - INFO - --- Running Data Preprocessing Script (Basic Filtering) ---
2025-04-22 10:49:40 - __main__ - INFO - Loading Hindi raw data from /workspace/indic-clip/data/raw/hindi/flickr8k_hindi_raw.jsonl...
2025-04-22 10:49:40 - __main__ - INFO - Loaded 8090 items from /workspace/indic-clip/data/raw/hindi/flickr8k_hindi_raw.jsonl
2025-04-22 10:49:40 - __main__ - INFO - Filtering Hindi data...
2025-04-22 10:49:40 - __main__ - INFO - Starting filtering process for 8090 raw items...


Filtering Data:   0%|          | 0/8090 [00:00<?, ?it/s]

2025-04-22 10:50:06 - __main__ - INFO - Finished filtering. Kept 8006 items.
2025-04-22 10:50:06 - __main__ - INFO - Skipped counts: {'invalid_entry': 0, 'image_error': 0, 'resolution': 48, 'aspect_ratio': 0, 'text_length': 3, 'duplicate_image': 1, 'duplicate_caption': 32}
2025-04-22 10:50:06 - __main__ - INFO - Saving 8006 filtered items to /workspace/indic-clip/data/processed/filtered_data.jsonl...
2025-04-22 10:50:06 - __main__ - INFO - Successfully saved filtered data.
2025-04-22 10:50:06 - __main__ - INFO - --- Data Preprocessing Script (Basic Filtering) Finished ---


In [None]:
#| hide
import nbdev
# nbdev.nbdev_export() # Run this in terminal to export