In [None]:
#!pip install pymongo

In [None]:
#!pip install -U sentence-transformers transformers

In [29]:
import os
import ssl
import certifi
from pymongo import MongoClient, UpdateOne
from pymongo.server_api import ServerApi
from sentence_transformers import SentenceTransformer
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
from io import BytesIO
import numpy as np
import pandas as pd
import torch
from logger import Logger

os.environ['CURL_CA_BUNDLE'] = './certificates/huggingface.co.pem'
ssl._create_default_https_context = ssl.create_default_context(cafile=certifi.where())

class DataIndexer:
    def __init__(self, mongo_uri, db_name, collection_name, text_model, clip_model_id):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.client = MongoClient(mongo_uri, server_api=ServerApi('1'))
        self.db = self.client[db_name]
        self.collection = self.db[collection_name]
        self.text_model = SentenceTransformer(text_model, trust_remote_code=True,device=device)
        self.text_model.max_seq_length = 2048
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.clip_model = CLIPModel.from_pretrained(clip_model_id).to(self.device)
        self.processor = CLIPProcessor.from_pretrained(clip_model_id)


    def get_text_embeddings(self,df, column, batch_size=8):
        embeddings = []
        for i in range(0, len(df), batch_size):
            batch = df[column].iloc[i:i+batch_size].tolist()
            batch_embeddings = self.text_model.encode(batch, batch_size=batch_size).tolist()
            embeddings.extend(batch_embeddings)
        return embeddings

    def get_image_embeddings(self, image_paths):
        """Batch process image embeddings for local image paths, ignoring invalid images."""
        images = []
        for image_path in image_paths:
            try:
                if not image_path or pd.isna(image_path):
                    print(f"Warning: Null or invalid image path: {image_path}")
                    continue
                image = Image.open(image_path).convert("RGB")
                images.append(image)
            except (FileNotFoundError, ValueError, OSError) as e:
                print(f"Error processing image {image_path}: {e}")
                continue
        if not images:
            #print("No valid images found for embedding.")
            return []
        try:
            inputs = self.processor(images=images, return_tensors="pt", padding=True).to(self.device)
            image_features = self.clip_model.get_image_features(**inputs)
            return image_features.cpu().detach().numpy().tolist()
        except Exception as e:
            print(f"Error during batch processing of images: {e}")
            return []


    def process_and_index(self, dataframe, unique_columns):
        dataframe['Text Embedding'] =  self.get_text_embeddings(dataframe,'Combined Text',8)
        non_empty_image_paths = dataframe["Sub Image Paths"].dropna()
        image_embeddings = {
        idx: self.get_image_embeddings(paths)
        for idx, paths in non_empty_image_paths.items()
        }
        dataframe["Image Embeddings"] = dataframe.index.map(lambda idx: image_embeddings.get(idx, []))
        records = dataframe.to_dict(orient="records")
        operations = [
            UpdateOne(
                {key: record[key] for key in unique_columns},
                {"$set": record},
                upsert=True
            )
            for record in records
        ]
        if operations:
            result = self.collection.bulk_write(operations)
            print(f"Upserts completed: {result.upserted_count}, Modified: {result.modified_count}")

def main():
    from generic import GenericFunction
    generic = GenericFunction()
    uri = generic.get_value('mongo_uri')
    db_name = generic.get_value('mongo_db')
    collection_name = generic.get_value('mongo_collection')
    text_model = generic.get_value('index_text_model')
    clip_model_id = generic.get_value('index_image_model')
    indexer = DataIndexer(uri, db_name, collection_name, text_model, clip_model_id)
    df = pd.read_csv(generic.get_value('pdf_parsed_location'))
    df['Sub Image Paths'] = df['Sub Image Paths'].apply(
        lambda x: eval(x) if isinstance(x, str) else x
    )
    indexer.process_and_index(df, unique_columns=["PDF Name", "Page Number"])

In [30]:
import os
if __name__ == "__main__":
    main()

2024-11-24 22:44:35,664 [INFO] Loaded config.json from: /Users/deepakn/Desktop/PDF-RAG/config.json
2024-11-24 22:44:35,877 [INFO] Load pretrained SentenceTransformer: jinaai/jina-embeddings-v2-small-en


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

The channel dimension is ambiguous. Got image shape (1, 102, 3). Assuming channels are the first dimension.
The channel dimension is ambiguous. Got image shape (1, 1, 3). Assuming channels are the first dimension.


Error during batch processing of images: mean must have 1 elements if it is an iterable, got 3
Error during batch processing of images: mean must have 1 elements if it is an iterable, got 3


The channel dimension is ambiguous. Got image shape (1, 1, 3). Assuming channels are the first dimension.


Error during batch processing of images: mean must have 1 elements if it is an iterable, got 3
Error processing image images/APP-TIAGO-FINAL-OMSB/page-2/page_2_sub_image_1.jb2: cannot identify image file '/Users/deepakn/Desktop/PDF-RAG/images/APP-TIAGO-FINAL-OMSB/page-2/page_2_sub_image_1.jb2'
Upserts completed: 419, Modified: 0
