In [1]:
import re
from llama_index.core.schema import TransformComponent
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.extractors import TitleExtractor, SummaryExtractor
from sentence_transformers import SentenceTransformer
from llama_index.core import SimpleDirectoryReader
from llama_index.core.schema import MetadataMode
from langchain_huggingface import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
from pydantic import BaseModel, Field
from typing import List


  from tqdm.autonotebook import tqdm, trange


In [2]:
# Initialize the embedding model
#huggingface_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
#embed_model = LangchainEmbedding(huggingface_embeddings)

# Define a custom transformation component
class CustomTransformation(TransformComponent):
    def __call__(self, nodes, **kwargs):
        for node in nodes:
            node.text = node.text.lower()
            node.text = re.sub(r'\s+', ' ', node.text)  # Replace multiple spaces with a single space
            node.text = re.sub(r'[^\w\s]', '', node.text)  # Removes punctuation
        return nodes

# # Define the embedding model transformation component
# class EmbeddingModel(TransformComponent):
#     def __init__(self):
#         self.model = embed_model

#     def __call__(self, nodes):
#         for node in nodes:
#             node.embedding = self.model.get_text_embedding(node.text)
#         return nodes
    


In [3]:
import os
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

def get_embedding_model(embedding_model_name, embed_batch_size):
    embedding_model = HuggingFaceEmbedding(
            model_name=embedding_model_name,
            embed_batch_size=embed_batch_size
        )
    return embedding_model

class EmbedModel(TransformComponent):
    embedding_model: object = Field(default=None, exclude=True)

    def __init__(self, **data):
        super().__init__(**data)
        self.embedding_model = get_embedding_model(
            embedding_model_name="BAAI/bge-small-en-v1.5",
            embed_batch_size=100
        )

    def __call__(self, nodes: List[object]) -> List[object]:
        for node in nodes:
            node.embedding = self.embedding_model.get_text_embedding(node.text)
        return nodes



In [14]:
def Sentence_Splitter_docs_into_nodes(all_documents):
    try:
        splitter = SentenceSplitter(
            chunk_size=1500,
            chunk_overlap=200
        )

        nodes = splitter.get_nodes_from_documents(all_documents)

        return nodes

    except Exception as e:
        print(f"Error splitting documents into nodes: {e}")
        return []

In [13]:
# Create the ingestion pipeline
pipeline = IngestionPipeline(
    transformations=[
        CustomTransformation(),
        Sentence_Splitter_docs_into_nodes(),
        EmbedModel(),
    ]
)

TypeError: Sentence_Splitter_docs_into_nodes() missing 1 required positional argument: 'all_documents'

In [11]:

if __name__ == '__main__':
    # Load data from directory

        #reader = SimpleDirectoryReader(input_dir=r"C:\Users\pavan\Desktop\Generative AI\RAG-Using-Hybrid-Search-and-Re-Ranker\data")
        documents = SimpleDirectoryReader(input_dir=r"C:\Users\pavan\Desktop\Generative AI\RAG-Using-Hybrid-Search-and-Re-Ranker\data").load_data(show_progress = True)
        print(f"Loaded {len(documents)} documents")
        if documents:
            documents = CustomTransformation(documents)

            # Split documents into nodes
            nodes = Sentence_Splitter_docs_into_nodes(documents)

            # Initialize embedding model
            embeddings = EmbedModel(nodes)
        else:
            print("No documents to process.")

        # Run the ingestion pipeline
        #nodes_parsed = pipeline.run(documents=documents)
        print(f"Created {len(embeddings)} nodes")
        

Loading files: 100%|██████████| 5/5 [00:09<00:00,  1.87s/file]


Loaded 68 documents




TypeError: got an unexpected keyword argument 'show_progress'

In [1]:
import fitz  # PyMuPDF
from uuid import uuid4
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct

ModuleNotFoundError: No module named 'fitz'

In [7]:
from llama_index.core.node_parser import SentenceSplitter

In [2]:
def Sentence_Splitter_docs_into_nodes(all_documents):
    """
    Splits the documents into nodes using a sentence splitter.
    """
    try:
        splitter = SentenceSplitter(
            chunk_size=1500,
            chunk_overlap=200
        )

        nodes = splitter.get_nodes_from_documents(all_documents)

        return nodes

    except Exception as e:
        print(f"Error splitting documents into nodes: {e}")
        return []

In [5]:
import fitz  # PyMuPDF
from uuid import uuid4
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct

def extract_metadata_from_pdf_and_process_nodes(pdf_path, nodes):
    """
    Extract metadata from a PDF and process the nodes with metadata.
    
    Args:
    ----
    pdf_path (str): The path to the PDF file.
    nodes (list): The list of document nodes.
    client (QdrantClient): The Qdrant client instance.
    collection_name (str): The name of the collection.
    """
    # Open the PDF file
    document = fitz.open(pdf_path)
    
    # Extract metadata
    metadata = document.metadata
    
    # Prepare the chunked metadata list
    chunked_metadata = []
    
    for item in nodes:
        content = item['content']  # Assume each node has 'content'
        source = item['metadata']['source']
        page = item['metadata']['page']
        
        id = str(uuid4())

        payload = {
           "page_content": content,
           "metadata": {
                        "id": id,
                        "page_content": content,
                        "source": source,
                        "page": page,
                        "Title": metadata.get('title', 'N/A'),
                        "Author": metadata.get('author', 'N/A'),
                        "CreationDate": metadata.get('creationDate', 'N/A'),
                        }
            }

        metadata_struct = PointStruct(id=id, payload=payload)
        chunked_metadata.append(metadata_struct)
    

In [None]:
def chunked_metadata(data):
    """
    Process and upsert chunked metadata into Qdrant.

    Args:
    ----
    data (list): The list of document chunks.
    client (QdrantClient): The Qdrant client instance.
    collection_name (str): The name of the collection.

    """
    chunked_metadata = []

    for item in data:
        content = item.page_content

        id = str(uuid4())
        source = item.metadata["source"]
        page = item.metadata["page"]

        payload = {
           "page_content": content,
           "metadata": {
                        "id": id,
                        "page_content": content,
                        "source": source,
                        "page": page,
                        }
            }

        metadata = PointStruct(id=id, payload=payload)
        chunked_metadata.append(metadata)


In [11]:
from llama_index.core import SimpleDirectoryReader

In [16]:
path = (r"C:\Users\pavan\Desktop\Generative AI\RAG-Using-Hybrid-Search-and-Re-Ranker\data")
pdf_path = (r"C:\Users\pavan\Desktop\Generative AI\RAG-Using-Hybrid-Search-and-Re-Ranker\data\Adaptive-RAG.pdf")
documents = SimpleDirectoryReader(input_dir=path).load_data()
nodes =  Sentence_Splitter_docs_into_nodes(documents)
extract_metadata_from_pdf_and_process_nodes(pdf_path, nodes)

TypeError: 'TextNode' object is not subscriptable

In [5]:
import logging
from dotenv import load_dotenv
import os
import json
from fastembed import SparseTextEmbedding, TextEmbedding
from qdrant_client import QdrantClient, models

# Load environmental variables from a .env file
load_dotenv()

Qdrant_API_KEY = os.getenv('Qdrant_API_KEY')
Qdrant_URL = os.getenv('Qdrant_URL')
Collection_Name = os.getenv('Collection_Name')

class QdrantIndexing:
    """
    A class for indexing documents using Qdrant vector database.
    """

    def __init__(self) -> None:
        """
        Initialize the QdrantIndexing object.
        """
        self.data_path = r"C:\Users\pavan\Desktop\Generative AI\RAG-Using-Hybrid-Search-and-Re-Ranker\data\nodes.json"
        self.Dense_Embedding_Model = "jinaai/jina-embeddings-v2-base-en	"
        self.Sparse_Embedding_Model = "Qdrant/bm42-all-minilm-l6-v2-attentions"
        self.qdrant_client = QdrantClient(
                            url=Qdrant_URL,
                            api_key=Qdrant_API_KEY)
        self.qdrant_client.set_model(self.Dense_Embedding_Model)
        self.qdrant_client.set_sparse_model(self.Sparse_Embedding_Model)
        self.metadata = []
        self.documents = []
        logging.info("QdrantIndexing object initialized.")

    def load_nodes(self, input_file):
        """
        Load nodes from a JSON file and extract metadata and documents.

        Args:
            input_file (str): The path to the JSON file.
        """
        with open(input_file, 'r') as file:
            self.nodes = json.load(file)

        for node in self.nodes:
            self.metadata.append(node['metadata'])
            self.documents.append(node['text'])

        logging.info(f"Loaded {len(self.nodes)} nodes from JSON file.")

    def client_collection(self):
        """
        Create a collection in Qdrant vector database.
        """
        if not self.qdrant_client.collection_exists(collection_name=f"{Collection_Name}"): 
            self.qdrant_client.create_collection(
                collection_name= Collection_Name,
                vectors_config={
                    "nomic": models.VectorParams(
                        size = 768,
                        distance = models.Distance.COSINE,
                        #optimizers_config=models.OptimizersConfigDiff(memmap_threshold=10000),
                    )
                },
                sparse_vectors_config={
                    "bm42": models.SparseVectorParams(
                        modifier = models.Modifier.IDF,
                    )
                }
            )
            logging.info(f"Created collection '{Collection_Name}' in Qdrant vector database.")

    

In [6]:
vectors_config= qdrant_client.get_fastembed_vector_params(),

In [7]:
vectors_config

({'fast-bge-small-en': VectorParams(size=384, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None)},)

In [2]:
qdrant_client = QdrantClient(
                            url=Qdrant_URL,
                            api_key=Qdrant_API_KEY)