<a href="https://colab.research.google.com/github/oluwafemidiakhoa/MLprject/blob/main/Rag_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies into the current runtime.
# pip flags: -q reduces pip's output, -U upgrades to the newest available release.
!pip install -qU llama-index
# Provides `llama_index.llms.groq` (the Groq LLM wrapper imported later in the notebook).
!pip install -qU llama-index-llms-groq
# Provides `llama_index.embeddings.huggingface` (HuggingFaceEmbedding).
!pip install -qU llama-index-embeddings-huggingface
# Provides `llama_index.utils.workflow` (the draw_* visualization helpers).
!pip install -qU llama-index-utils-workflow
# Provides `dotenv.load_dotenv`.
!pip install python-dotenv
# NOTE(review): pyvis is not imported directly in this notebook; presumably it is
# needed by the workflow drawing helpers that write .html files — confirm.
!pip install pyvis




In [None]:
import os

# Create the 'Data' directory; the workflow is later run with
# dirname="/content/Data", so this is where the documents to ingest live.
# exist_ok=True makes the call a no-op when the directory already exists,
# so the cell can be re-run without raising.
os.makedirs('/content/Data', exist_ok=True)


In [None]:
# Move the uploaded PDF file to the 'Data' directory.
#
# The move is guarded so that re-running this cell after the file has already
# been moved does not fail with a bare FileNotFoundError from os.rename.
research_pdf_src = '/content/Research.pdf'
research_pdf_dst = '/content/Data/Research.pdf'

# Do not rely on an earlier cell having created the destination directory.
os.makedirs(os.path.dirname(research_pdf_dst), exist_ok=True)

if os.path.exists(research_pdf_src):
    os.rename(research_pdf_src, research_pdf_dst)
elif os.path.exists(research_pdf_dst):
    # A previous run already moved the file; nothing left to do.
    print(f"{research_pdf_dst} is already in place; nothing to move.")
else:
    # Neither location has the file: fail loudly with an actionable message.
    raise FileNotFoundError(
        f"{research_pdf_src} not found. Upload Research.pdf to /content "
        "before running this cell."
    )


In [31]:
# Import necessary libraries
import os
from dotenv import load_dotenv
import nest_asyncio
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import NodeWithScore
from llama_index.core.response_synthesizers import CompactAndRefine
from llama_index.core import SimpleDirectoryReader
from llama_index.core.postprocessor.llm_rerank import LLMRerank
from llama_index.core.workflow import (
    Context,
    Workflow,
    StartEvent,
    StopEvent,
    step,
    Event
)
from llama_index.llms.groq import Groq
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.workflow.utils import get_steps_from_class, get_steps_from_instance
from llama_index.utils.workflow import draw_all_possible_flows, draw_most_recent_execution
from google.colab import files

# Step 1: Upload the PDF file
uploaded = files.upload()

# Step 2: Create a directory and move the PDF file to it
os.makedirs('/content/Data', exist_ok=True)

# Move uploaded files to the 'Data' directory
for filename in uploaded.keys():
    os.rename(filename, f'/content/Data/{filename}')

# Step 3: Install required libraries
!pip install -qU llama-index
!pip install -qU llama-index-llms-groq
!pip install -qU llama-index-embeddings-huggingface
!pip install -qU llama-index-utils-workflow
!pip install python-dotenv
!pip install pyvis

# Step 4: Set up environment variables
# Replace <your_api_key_here> with your actual API key
os.environ["GROQ_API_KEY"] = ""

# Load the environment variables
load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")

# Define events
class RetrieverEvent(Event):
    """Workflow event carrying the result of the retrieval step.

    Returned by ``RAGWorkflow.retrieve`` and accepted by ``RAGWorkflow.rerank``.
    """
    # Nodes returned by the retriever's `aretrieve` call for the current query.
    nodes: list[NodeWithScore]

class RerankEvent(Event):
    """Workflow event carrying the result of reranking the retrieved nodes.

    Returned by ``RAGWorkflow.rerank`` and accepted by ``RAGWorkflow.synthesize``.
    """
    # Nodes returned by the reranker's `postprocess_nodes` call.
    nodes: list[NodeWithScore]

# Setup the Workflow
class RAGWorkflow(Workflow):
    """RAG workflow with two entry points sharing the same ``StartEvent``.

    ``ingest`` reacts to a ``dirname`` keyword and ``retrieve`` reacts to a
    ``query`` keyword; each returns ``None`` when its keyword is absent.
    A query run flows ``retrieve`` -> ``rerank`` -> ``synthesize``.
    """

    @step
    async def ingest(self, ctx: Context, ev: StartEvent) -> StopEvent | None:
        """Build a vector index from the files under ``dirname``.

        Returns ``None`` when the start event carries no ``dirname``;
        otherwise a ``StopEvent`` whose result is the new index.
        """
        source_dir = ev.get("dirname")
        if source_dir:
            loaded_docs = SimpleDirectoryReader(source_dir).load_data()
            embedder = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
            built_index = VectorStoreIndex.from_documents(
                documents=loaded_docs,
                embed_model=embedder,
            )
            return StopEvent(result=built_index)
        return None

    @step
    async def retrieve(self, ctx: Context, ev: StartEvent) -> RetrieverEvent | None:
        """Fetch the nodes most similar to ``query`` from the supplied ``index``.

        Returns ``None`` when the start event has no ``query`` or no ``index``;
        otherwise a ``RetrieverEvent`` holding the retrieved nodes.
        """
        query = ev.get("query")
        if not query:
            return None
        index = ev.get("index")

        print(f"Query the database with: {query}")

        # The rerank and synthesize steps read the query back from the context.
        await ctx.set("query", query)

        if index is None:
            print("Index is empty, load some documents before querying!")
            return None

        nodes = await index.as_retriever(similarity_top_k=2).aretrieve(query)
        print(f"Retrieved {len(nodes)} nodes.")
        return RetrieverEvent(nodes=nodes)

    @step
    async def rerank(self, ctx: Context, ev: RetrieverEvent) -> RerankEvent:
        """Reorder the retrieved nodes with an LLM-based reranker."""
        # NOTE(review): confirm this model id is still served by the provider.
        rerank_llm = Groq(model="llama-3.1-70b-versatile")
        ranker = LLMRerank(choice_batch_size=5, top_n=3, llm=rerank_llm)

        # Echo the query being reranked against.
        print(await ctx.get("query", default=None), flush=True)

        query_str = await ctx.get("query", default=None)
        new_nodes = ranker.postprocess_nodes(ev.nodes, query_str=query_str)
        print(f"Reranked nodes to {len(new_nodes)}")
        return RerankEvent(nodes=new_nodes)

    @step
    async def synthesize(self, ctx: Context, ev: RerankEvent) -> StopEvent:
        """Produce the final answer from the reranked nodes.

        The ``StopEvent`` result is whatever ``asynthesize`` returns for the
        stored query and the reranked nodes.
        """
        summarizer = CompactAndRefine(
            llm=Groq(model="llama-3.1-70b-versatile"),
            streaming=True,
            verbose=True,
        )
        query = await ctx.get("query", default=None)
        return StopEvent(result=await summarizer.asynthesize(query, nodes=ev.nodes))

# Instantiate the workflow and check steps
workflow = RAGWorkflow()
steps = get_steps_from_class(RAGWorkflow)
if not steps:
    steps = get_steps_from_instance(workflow)

for step_name, step_func in steps.items():
    step_config = getattr(step_func, "__step_config", None)
    print(f"Step: {step_name}, Step Config: {step_config}")

# Run the workflow
nest_asyncio.apply()

# Ingest the documents from the correct directory
index = await workflow.run(dirname="/content/Data")

# Run a query
result = await workflow.run(query="Uncertainties of the mean annual cycle of residual vertical displacements?", index=index)
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

# Visualization
draw_all_possible_flows(RAGWorkflow, filename="multi_step_workflow.html")
draw_most_recent_execution(workflow, filename="rag_flow_recent.html")


Saving Research.pdf to Research.pdf
Step: _done, Step Config: accepted_events=[<class 'llama_index.core.workflow.events.StopEvent'>] event_name='ev' return_types=[<class 'NoneType'>] context_parameter='ctx' num_workers=4 requested_services=[] retry_policy=None
Step: ingest, Step Config: accepted_events=[<class 'llama_index.core.workflow.events.StartEvent'>] event_name='ev' return_types=[<class 'llama_index.core.workflow.events.StopEvent'>] context_parameter='ctx' num_workers=4 requested_services=[] retry_policy=None
Step: rerank, Step Config: accepted_events=[<class '__main__.RetrieverEvent'>] event_name='ev' return_types=[<class '__main__.RerankEvent'>] context_parameter='ctx' num_workers=4 requested_services=[] retry_policy=None
Step: retrieve, Step Config: accepted_events=[<class 'llama_index.core.workflow.events.StartEvent'>] event_name='ev' return_types=[<class '__main__.RetrieverEvent'>] context_parameter='ctx' num_workers=4 requested_services=[] retry_policy=None
Step: synthesiz