In [1]:
import os
import json
import re
from llama_index.core.schema import Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SimpleDirectoryReader

In [2]:
def documents_transformation(input_dir: str):
    """Load every file under ``input_dir`` and normalise its text.

    Each loaded document is lower-cased, stripped of punctuation (any
    character that is neither a word character nor whitespace) and has runs
    of whitespace collapsed to a single space. The metadata of the loaded
    document is passed through unchanged.

    Args:
        input_dir: Directory handed to ``SimpleDirectoryReader``.

    Returns:
        A list of new ``Document`` objects, one per loaded document.
    """
    print(f"Input directory: {input_dir}")
    documents = SimpleDirectoryReader(input_dir=input_dir).load_data()
    print(f"Loaded {len(documents)} documents")

    transformed_documents = []
    for doc in documents:
        text = doc.get_content().lower()
        # Strip punctuation BEFORE collapsing whitespace: deleting a
        # free-standing symbol ("a - b") leaves two adjacent spaces, which
        # would survive if whitespace had already been normalised.
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        transformed_documents.append(Document(text=text, metadata=doc.metadata))

    print(f"Transformed {len(transformed_documents)} documents")
    return transformed_documents

In [3]:
def split_documents_into_nodes(documents, chunk_size, chunk_overlap):
    """Chunk ``documents`` into nodes using a ``SentenceSplitter``.

    Args:
        documents: Documents passed to ``get_nodes_from_documents``.
        chunk_size: Forwarded to ``SentenceSplitter(chunk_size=...)``.
        chunk_overlap: Forwarded to ``SentenceSplitter(chunk_overlap=...)``.

    Returns:
        The nodes produced by the splitter. If building the splitter or
        splitting raises, the error is printed and an empty list is returned.
    """
    try:
        return SentenceSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        ).get_nodes_from_documents(documents)
    except Exception as split_error:
        # Best-effort: report the failure and fall through to the empty result.
        print(f"Error splitting documents into nodes: {split_error}")
    return []

In [4]:
def save_nodes(nodes, output_file=None):
    """Serialise ``nodes`` to a JSON file.

    Each node is converted with its ``dict()`` method and the resulting list
    is written as indented JSON. Failures are reported with ``print`` rather
    than raised.

    Args:
        nodes: Iterable of objects exposing a ``dict()`` method.
        output_file: Destination path. When omitted, the original hard-coded
            ``nodes.json`` location is used, so existing callers are unaffected.
    """
    if output_file is None:
        output_file = r"C:\Users\pavan\Desktop\Generative AI\RAG-Automation-Using-Llamaindex-Agents-and-Qdrant\data\nodes.json"
    try:
        # A bare file name has an empty dirname, and os.makedirs('') raises.
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        nodes_dict = [node.dict() for node in nodes]
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(nodes_dict, file, indent=4)
        print(f"Saved nodes to {output_file}")
    except Exception as e:
        print(f"Error saving nodes to file: {e}")
    


In [5]:
def process_document(input_dir: str, chunk_size: int, chunk_overlap: int) -> None:
    """Run the preprocessing pipeline: load and clean, split, then save.

    Args:
        input_dir: Directory passed to ``documents_transformation``.
        chunk_size: Chunk size passed to ``split_documents_into_nodes``.
        chunk_overlap: Chunk overlap passed to ``split_documents_into_nodes``.
    """
    source_dir = f"{input_dir}"  # formats the argument into a string

    cleaned_docs = documents_transformation(source_dir)
    print("Document Transformation is done")

    chunks = split_documents_into_nodes(cleaned_docs, chunk_size, chunk_overlap)
    print("Transformed into nodes")

    save_nodes(chunks)
    print("saved the nodes")

In [17]:
def process_documents(input_dir: str) -> list:
    """Load and clean the documents under ``input_dir`` without splitting or saving.

    Args:
        input_dir: Directory passed to ``documents_transformation``.

    Returns:
        The list returned by ``documents_transformation``.
    """
    input_dir = f"{input_dir}"
    documents = documents_transformation(input_dir)
    print("Document Transformation is done")

    return documents
 

In [6]:
# Source directory for the pipeline. NOTE(review): absolute, machine-specific
# path; it must be edited before the notebook can run elsewhere.
input_dir = r'C:\Users\pavan\Desktop\Generative AI\RAG-Automation-Using-Llamaindex-Agents-and-Qdrant\data'

In [7]:
# Run the full pipeline with chunk_size=1500 and chunk_overlap=200.
process_document(input_dir, 1500, 200)

Input directory: C:\Users\pavan\Desktop\Generative AI\RAG-Automation-Using-Llamaindex-Agents-and-Qdrant\data
Loaded 67 documents
Transformed 67 documents
Document Transformation is done
Transformed into nodes
Saved nodes to C:\Users\pavan\Desktop\Generative AI\RAG-Automation-Using-Llamaindex-Agents-and-Qdrant\data\nodes.json
saved the nodes


In [1]:
import json
import logging
import os
import pprint
from typing import List

from colorama import Fore, Back, Style
from dotenv import load_dotenv
from fastembed import SparseTextEmbedding, TextEmbedding
from llama_index.agent.openai import OpenAIAgent
from llama_index.core.tools import FunctionTool
from llama_index.llms.openai import OpenAI
from qdrant_client import QdrantClient, models
from qdrant_client.http.models import PointStruct, SparseVector
from tqdm import tqdm

# Set up logging
logging.basicConfig(level=logging.INFO)

In [2]:
# Load environment variables from a .env file
load_dotenv()

# Qdrant connection settings and collection name, read from the environment.
# os.getenv returns None for any variable that is not set.
Qdrant_API_KEY = os.getenv('Qdrant_API_KEY')
Qdrant_URL = os.getenv('Qdrant_URL')
Collection_Name = os.getenv('Collection_Name')

In [3]:
def load_nodes(payload_file=None):
    """Read the saved nodes JSON and split it into texts and metadata.

    Args:
        payload_file: Path of the JSON file to read. When omitted, the
            original hard-coded ``nodes.json`` location is used, so existing
            callers are unaffected.

    Returns:
        A ``(documents, metadata)`` tuple of parallel lists holding each
        node's ``'text'`` and ``'metadata'`` entries, in file order.

    Raises:
        Exception: Any error from opening, parsing or indexing the file is
            logged and re-raised.
    """
    if payload_file is None:
        payload_file = r'C:\Users\pavan\Desktop\Generative AI\RAG-Automation-Using-Llamaindex-Agents-and-Qdrant\data\nodes.json'

    try:
        with open(payload_file, 'r', encoding='utf-8') as file:
            nodes = json.load(file)

        metadata = [node['metadata'] for node in nodes]
        documents = [node['text'] for node in nodes]

        logging.info(f"Loaded {len(nodes)} the nodes from JSON file")

    except Exception as e:
        logging.error(f"Error loading nodes from JSON file: {e}")
        raise

    return documents, metadata

In [4]:
def client_collection(embedding_model, documents, metadata,
                      collection_name="Hybrid_RAG_Collection"):
    """Recreate a hybrid (dense + sparse) collection and upload the documents.

    Args:
        embedding_model: Name of the dense embedding model, e.g.
            ``'sentence-transformers/all-MiniLM-L6-v2'``.
        documents: List of chunk texts to upload.
        metadata: List of metadata dicts, parallel to ``documents``.
        collection_name: Target collection. Defaults to the name the
            notebook has always used.

    Raises:
        Exception: Any error from recreating the collection or adding the
            documents is logged and re-raised.
    """
    qdrant_client = QdrantClient(
        url=Qdrant_URL,
        api_key=Qdrant_API_KEY)

    # set_model / set_sparse_model expect model *names*, as in the notebook's
    # step-by-step cells; the client loads the models itself, so no
    # TextEmbedding / SparseTextEmbedding instance is built here.
    qdrant_client.set_model(embedding_model)
    qdrant_client.set_sparse_model("prithivida/Splade_PP_en_v1")

    try:
        # NOTE: this drops any existing collection with the same name.
        qdrant_client.recreate_collection(
            collection_name=collection_name,
            vectors_config=qdrant_client.get_fastembed_vector_params(),
            sparse_vectors_config=qdrant_client.get_fastembed_sparse_vector_params(),
        )

        # Point ids are 0..len(documents)-1; tqdm only adds a progress bar.
        ids = qdrant_client.add(
            collection_name=collection_name,
            documents=documents,
            metadata=metadata,
            ids=tqdm(range(len(documents))),
        )

        logging.info(f"Inserted {len(ids)} vectors into Qdrant cluster")

    except Exception as e:
        logging.error(f"Error inserting vectors into Qdrant cluster: {e}")
        raise


In [5]:
def indexing(embedding_model: str) -> None:
    """Index the saved document nodes into the Qdrant cluster.

    Loads the node texts and metadata with ``load_nodes`` and uploads them
    with ``client_collection``.

    Args:
        embedding_model: Name of the dense embedding model to use, e.g.
            ``'sentence-transformers/all-MiniLM-L6-v2'``.
    """
    documents, metadata = load_nodes()
    logging.info("Loaded the nodes from json file")
    client_collection(embedding_model, documents, metadata)
    logging.info("Inserted the documents into the Qdrant Cluster")

In [6]:
# Notebook-level Qdrant client used by the step-by-step cells that follow,
# built from the URL and API key read from the environment.
qdrant_client = QdrantClient(
            url=Qdrant_URL,
            api_key=Qdrant_API_KEY)

In [7]:
# Select the dense embedding model by name.
qdrant_client.set_model("sentence-transformers/all-MiniLM-L6-v2")
# Comment out this line to use dense vectors only.
qdrant_client.set_sparse_model("prithivida/Splade_PP_en_v1")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [8]:
# (Re)create the collection with vector settings taken from the models chosen
# on the client. The captured output shows a DELETE followed by a PUT, i.e. an
# existing collection of this name is dropped first.
qdrant_client.recreate_collection(
    collection_name="Hybrid_RAG_Collection",
    vectors_config=qdrant_client.get_fastembed_vector_params(),
    # Comment out this line to use dense vectors only.
    sparse_vectors_config=qdrant_client.get_fastembed_sparse_vector_params(),  
)

  qdrant_client.recreate_collection(
INFO:httpx:HTTP Request: DELETE https://c77ac75e-3a41-4acc-98d2-c9c3eb11b5ea.us-east4-0.gcp.cloud.qdrant.io:6333/collections/Hybrid_RAG_Collection "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: PUT https://c77ac75e-3a41-4acc-98d2-c9c3eb11b5ea.us-east4-0.gcp.cloud.qdrant.io:6333/collections/Hybrid_RAG_Collection "HTTP/1.1 200 OK"


True

In [9]:
import json

# Read the nodes JSON and split it into parallel lists of texts and metadata.
# NOTE(review): this repeats what load_nodes() does, at notebook scope.
input_file = r"C:\Users\pavan\Desktop\Generative AI\RAG-Automation-Using-Llamaindex-Agents-and-Qdrant\data\nodes.json"
metadata = []
documents = []

with open(input_file, 'r') as file:
        nodes = json.load(file)

# Each node is expected to carry 'metadata' and 'text' keys.
for node in nodes:
    metadata.append(node['metadata'])
    documents.append(node['text'])

In [10]:
len(documents)

155

In [11]:
from tqdm import tqdm

# Upload every chunk text with its metadata through the client's `add` helper.
# Point ids are 0..len(documents)-1, wrapped in tqdm for a progress bar.
# The captured run below was interrupted manually before any progress showed.
qdrant_client.add(
    collection_name="Hybrid_RAG_Collection",
    documents=documents,
    metadata=metadata,
    ids=tqdm(range(len(documents))),
)

  0%|          | 0/155 [00:00<?, ?it/s]INFO:httpx:HTTP Request: GET https://c77ac75e-3a41-4acc-98d2-c9c3eb11b5ea.us-east4-0.gcp.cloud.qdrant.io:6333/collections/Hybrid_RAG_Collection "HTTP/1.1 200 OK"
  0%|          | 0/155 [01:44<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Run the end-to-end indexing helper with the MiniLM model name.
indexing(embedding_model = 'sentence-transformers/all-MiniLM-L6-v2')

In [None]:
def DocumentPreprocessingAgent(state: dict) -> "OpenAIAgent":
    """Build the agent that indexes the saved nodes into Qdrant.

    The agent is given two tools: ``indexing`` and a local ``done`` callback
    that signals completion by updating ``state``.

    Args:
        state: Shared, mutable state dict. It is pretty-printed into the
            system prompt once, when the agent is built, and ``done`` sets
            its ``"current_speaker"`` and ``"just_finished"`` keys.

    Returns:
        The agent created by ``OpenAIAgent.from_tools`` with the
        ``gpt-3.5-turbo`` model.
    """

    def done() -> None:
        """When you inserted the vectors into the Qdrant Cluster, call this tool."""
        logging.info("Indexing of the nodes is complete")
        state["current_speaker"] = None
        state["just_finished"] = True

    tools = [
        FunctionTool.from_defaults(fn=indexing),
        FunctionTool.from_defaults(fn=done),
    ]

    system_prompt = (f"""
    You are a helpful assistant that is indexing documents for a retrieval-augmented generation (RAG) system.
    Your task is to index the documents into a Qdrant cluster.
    To do this, you need to know the embedding model to use.
    You can ask the user to supply this.
    If the user supplies the embedding model, call the tool "indexing" with this parameter to index the documents into the Qdrant cluster.
    The current user state is:
    {pprint.pformat(state, indent=4)}
    When you have indexed the documents into the Qdrant cluster, call the tool "done" to signal that you are done.
    If the user asks to do anything other than index the documents, call the tool "done" to signal some other agent should help.
    """)

    return OpenAIAgent.from_tools(
        tools,
        llm=OpenAI(model="gpt-3.5-turbo"),
        system_prompt=system_prompt,
    )
