In [None]:
#!pip install pymongo

In [None]:
#!pip install -U sentence-transformers transformers

In [25]:
import os
import ssl
import certifi
from pymongo import MongoClient, UpdateOne
from pymongo.server_api import ServerApi
from sentence_transformers import SentenceTransformer
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
from io import BytesIO
import numpy as np
import pandas as pd
import torch
from logger import Logger

os.environ['CURL_CA_BUNDLE'] = './certificates/huggingface.co.pem'
ssl._create_default_https_context = ssl.create_default_context(cafile=certifi.where())

class DataIndexer:
    def __init__(self, mongo_uri, db_name, collection_name, text_model, clip_model_id):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.client = MongoClient(mongo_uri, server_api=ServerApi('1'))
        self.db = self.client[db_name]
        self.collection = self.db[collection_name]
        self.text_model = SentenceTransformer(text_model, trust_remote_code=True,device=device)
        self.text_model.max_seq_length = 2048
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.clip_model = CLIPModel.from_pretrained(clip_model_id).to(self.device)
        self.processor = CLIPProcessor.from_pretrained(clip_model_id)


    def get_text_embeddings(self,df, column, batch_size=8):
        embeddings = []
        for i in range(0, len(df), batch_size):
            batch = df[column].iloc[i:i+batch_size].tolist()
            batch_embeddings = self.text_model.encode(batch, batch_size=batch_size).tolist()
            embeddings.extend(batch_embeddings)
        return embeddings

    def get_image_embeddings(self, image_paths):
        """Batch process image embeddings for local image paths, ignoring invalid images."""
        images = []
        for image_path in image_paths:
            try:
                if not image_path or pd.isna(image_path):
                    print(f"Warning: Null or invalid image path: {image_path}")
                    continue
                image = Image.open(image_path).convert("RGB")
                images.append(image)
            except (FileNotFoundError, ValueError, OSError) as e:
                print(f"Error processing image {image_path}: {e}")
                continue
        if not images:
            #print("No valid images found for embedding.")
            return []
        try:
            inputs = self.processor(images=images, return_tensors="pt", padding=True).to(self.device)
            image_features = self.clip_model.get_image_features(**inputs)
            return image_features.cpu().detach().numpy().tolist()
        except Exception as e:
            print(f"Error during batch processing of images: {e}")
            return []


    def process_and_index(self, dataframe, unique_columns):
        dataframe['Text Embedding'] =  self.get_text_embeddings(dataframe,'Combined Text',8)
        non_empty_image_paths = dataframe["Sub Image Paths"].dropna()
        image_embeddings = {
        idx: self.get_image_embeddings(paths)
        for idx, paths in non_empty_image_paths.items()
        }
        dataframe["Image Embeddings"] = dataframe.index.map(image_embeddings.get).apply(lambda x: x if x is not None else [])
        records = dataframe.to_dict(orient="records")
        operations = [
            UpdateOne(
                {key: record[key] for key in unique_columns},
                {"$set": record},
                upsert=True
            )
            for record in records
        ]
        if operations:
            result = self.collection.bulk_write(operations)
            print(f"Upserts completed: {result.upserted_count}, Modified: {result.modified_count}")

def main():
    from generic import GenericFunction
    generic = GenericFunction()
    uri = generic.get_value('mongo_uri')
    db_name = generic.get_value('mongo_db')
    collection_name = generic.get_value('mongo_collection')
    text_model = generic.get_value('index_text_model')
    clip_model_id = generic.get_value('index_image_model')
    indexer = DataIndexer(uri, db_name, collection_name, text_model, clip_model_id)
    df = pd.read_csv(generic.get_value('pdf_parsed_location'))
    df['Sub Image Paths'] = df['Sub Image Paths'].apply(
        lambda x: eval(x) if isinstance(x, str) else x
    )
    indexer.process_and_index(df, unique_columns=["PDF Name", "Page Number"])

In [26]:
import os
if __name__ == "__main__":
    main()

2024-11-24 22:33:16,121 [INFO] Loaded config.json from: /Users/deepakn/Desktop/PDF-RAG/config.json
2024-11-24 22:33:16,218 [INFO] Load pretrained SentenceTransformer: jinaai/jina-embeddings-v2-small-en


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.


The channel dimension is ambiguous. Got image shape (1, 102, 3). Assuming channels are the first dimension.


No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
Error during batch processing of images: mean must have 1 elements if it is an iterable, got 3


The channel dimension is ambiguous. Got image shape (1, 1, 3). Assuming channels are the first dimension.


Error during batch processing of images: mean must have 1 elements if it is an iterable, got 3
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.


The channel dimension is ambiguous. Got image shape (1, 1, 3). Assuming channels are the first dimension.


Error during batch processing of images: mean must have 1 elements if it is an iterable, got 3
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images found for embedding.
No valid images f

TypeError: 'value' must be a scalar, passed: list

In [None]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import os
import ssl
import requests
from sentence_transformers.util import cos_sim
import certifi
import torch
from sentence_transformers import SentenceTransformer
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image
import requests
from io import BytesIO
import numpy as np
import pandas as pd
import pytesseract
from transformers import AutoTokenizer

uri = "mongodb+srv://pdfrag6989:cGSRURWmyjm06YDn@cluster0.fh8mk.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
client = MongoClient(uri)
db = client['car_manual']
collection = db['pdf_parsed_data']

In [None]:
client = MongoClient(uri, server_api=ServerApi('1'))
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

In [None]:
model = SentenceTransformer("jinaai/jina-embeddings-v2-small-en",trust_remote_code=True)

In [None]:
df=pd.read_csv("./data_parsed/Combined_Output.csv")
df['']
df.head()

In [None]:
df.info()

In [None]:
df=df[['PDF Name','Page Number','Page Title','combined_Text','All Image Paths','Image Description','Table JSON','Table Description']]
df.head()

In [None]:
df['embeddings'] = df['combined_Text'].apply(lambda x: model.encode(x).tolist())

In [None]:
# Convert DataFrame to list of dictionaries
records = df.to_dict(orient='records')

# Insert records into MongoDB
collection.insert_many(records)
print("Data inserted successfully!")

In [None]:
# Reading the top documents 
documents = list(collection.find())

# Extract embeddings and texts
embeddings = np.array([doc['embeddings'] for doc in documents])
texts = [doc['combined_Text'] for doc in documents]

# New query text and embedding
query = "Airbag Fault"
query_embedding = model.encode(query).reshape(1, -1)

similarities = cosine_similarity(query_embedding, embeddings).flatten()

# Get top N results
top_n = 3
top_indices = similarities.argsort()[-top_n:][::-1]

# Display top results
print("Top similar documents:")
for idx in top_indices:
    print(f"Text: {texts[idx]} | Similarity: {similarities[idx]:.4f}")
    print("\n")