In [1]:
# !pip install --upgrade pip

In [2]:
# !pip install groq qdrant-client
# !pip install langchain-groq
# !pip install langchainhub
# !pip install langchain_community
# !pip install gradio
# %pip install pydantic<2.0

# Imports

In [3]:
import os
from PyPDF2 import PdfReader
import numpy as np
from langchain_community.vectorstores import Qdrant
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
import numpy as np
from langchain.tools.retriever import create_retriever_tool
from langchain.agents import Tool
from langchain import hub
from langchain.agents import AgentExecutor, create_react_agent
from langchain_community.llms import OpenAI
from langchain_groq import ChatGroq
import gradio as gr
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Extracting Text from PDFs

In [15]:
def extract_text_from_pdf(pdf_path):
    """
    Read a PDF and return the text of all of its pages as a single string.

    Pages are visited in the order the reader exposes them and their text is
    concatenated without any separator.

    Args:
    - pdf_path (str): The path to the PDF file.

    Returns:
    - str: The extracted text content.
    """
    pdf = PdfReader(pdf_path)
    page_texts = (page.extract_text() for page in pdf.pages)
    return "".join(page_texts)

def extract_text_from_pdfs_in_directory(directory):
    """
    Convert every PDF in a directory into a text file stored next to it.

    Each directory entry whose name ends with ".pdf" is passed to
    extract_text_from_pdf, and the returned text is written to a file with
    the same base name and a ".txt" extension inside the same directory.

    Args:
    - directory (str): The path to the directory containing PDF files.
    """
    pdf_names = [entry for entry in os.listdir(directory) if entry.endswith(".pdf")]
    for pdf_name in pdf_names:
        stem, _extension = os.path.splitext(pdf_name)
        target_path = os.path.join(directory, stem + ".txt")
        content = extract_text_from_pdf(os.path.join(directory, pdf_name))
        with open(target_path, "w") as handle:
            handle.write(content)

# Directory holding the source PDFs. The path is relative, so it is resolved
# against the working directory the notebook is run from.
# Alternative location kept for reference: directory_path = "Docs/"
directory_path = "../../../../_data/in_pdf/"

# Write a .txt file next to every .pdf found in that directory
extract_text_from_pdfs_in_directory(directory_path)


# Generating Embeddings

In [16]:
# Read the Groq API key from the GROQ_API_KEY environment variable so the
# secret never has to be pasted into (and saved with) the notebook. The
# placeholder is kept only as a fallback for when the variable is not set.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "<GROQ_API_KEY>")

In [18]:
# Step 1: Collect the names of all .txt files found in the data directory
directory_path = "../../../../_data/in_pdf/"
txt_files = list(filter(lambda entry: entry.endswith('.txt'), os.listdir(directory_path)))
print(txt_files)

['cnpq_tabela-areas-conhecimento.txt', 'Relatório_Autoavaliação_Produtividade Docente_V3.txt']


In [None]:
# Step 2: Load every text file, split it into overlapping chunks and tag each
# chunk with the file it came from.

# The splitter is configured identically for every file, so it is built once
# here instead of once per loop iteration.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100, separator="\n")

# Maps each text file name to the list of chunks produced from it
all_documents = {}
for txt_file in txt_files:
    loader = TextLoader(os.path.join(directory_path, txt_file))
    documents = loader.load()

    docs = text_splitter.split_documents(documents)
    for doc in docs:
        # Overwrite the "source" metadata with the bare file name
        doc.metadata["source"] = txt_file

    all_documents[txt_file] = docs

In [None]:
# Step 3: Initialize the sentence-embedding model used to index the chunks.
# HuggingFaceEmbeddings is imported from langchain_community, the package the
# other vector-store and loader imports in this notebook already come from;
# the `langchain.embeddings` path is a deprecated alias for the same class.
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

In [None]:
# Step 4: Build one in-memory Qdrant collection per text file, keyed by file name
qdrant_collections = {
    txt_file: Qdrant.from_documents(
        all_documents[txt_file],
        embeddings,
        location=":memory:",
        collection_name=txt_file,
    )
    for txt_file in txt_files
}

In [None]:
# Sanity check: print the collection name held by the vector store of each file
for txt_file in txt_files:
    print(f"Collection: {qdrant_collections[txt_file].collection_name}")

In [None]:
# One retriever per text file, keyed by file name like the collections
retriever = {
    txt_file: qdrant_collections[txt_file].as_retriever()
    for txt_file in txt_files
}

# Setting Up the ReAct Agent Tools

In [None]:
def get_age(name: str, person_database: dict) -> int:
    """
    Look up a person's age in the given database.

    Args:
    - name (str): The name of the person; must match a database key exactly.
    - person_database (dict): Maps a person's name to a record that holds an
      "Age" entry.

    Returns:
    - int: The stored age, or None when the name is not a key of the database.
    """
    if name not in person_database:
        return None
    record = person_database[name]
    return record["Age"]


def get_age_info(name: str) -> str:
    """
    Report the age recorded for a person in the built-in demo database.

    Only the age is reported; the "Nationality" field stored alongside it is
    not used. Surrounding whitespace and quote characters are removed from
    the name before the lookup, because this function is exposed as an agent
    tool and model-written input may arrive as ' Bob\\n' or '"Bob"'.

    Args:
    - name (str): The name of the person.

    Returns:
    - str: "\\nAge: <age>\\n" when the person is known, otherwise a message
      saying that no age information was found for that name.
    """
    person_database = {
        "Sam": {"Age": 21, "Nationality": "US"},
        "Alice": {"Age": 25, "Nationality": "UK"},
        "Bob": {"Age": 11, "Nationality": "US"}
    }
    # Normalise the tool input so stray whitespace/quotes do not cause a miss
    cleaned_name = name.strip().strip("\"'").strip()
    age = get_age(cleaned_name, person_database)
    if age is None:
        return f"\nAge Information for {cleaned_name} not found.\n"
    return f"\nAge: {age}\n"
    


def get_today_date(input : str) -> str:
    """
    Return today's date wrapped in the padding the agent tools use.

    Args:
    - input (str): Ignored; present only because the function is called with
      a single string argument when used as a tool.

    Returns:
    - str: "\\n <date> \\n", where <date> is the string form of
      datetime.date.today().
    """
    from datetime import date
    return f"\n {date.today()} \n"



def get_relevant_document(name : str, max_chunks : int = 4) -> str:
    """
    Retrieve the chunks most relevant to the current query from the best-matching file.

    The file is chosen by fuzzy-matching `name` against the file names in the
    module-level `txt_files`. The chunks are then retrieved for the
    module-level `query` (not for `name`) through that file's entry in the
    module-level `retriever` mapping. The assembled text is also stored in
    the module-level `retrieved_text`, which get_summarized_text reads.

    Args:
    - name (str): Approximate name of the document to search in.
    - max_chunks (int): Maximum number of retrieved chunks to include.
      Defaults to 4.

    Returns:
    - str: A header line followed by the retrieved chunks, each terminated by
      a newline; or a short notice when there is no file to match against
      (in that case `retrieved_text` is left untouched).
    """
    global query, retrieved_text

    # Find the best match using fuzzy search
    best_match = process.extractOne(name, txt_files, scorer=fuzz.ratio)
    if best_match is None:
        # Nothing to match against (e.g. txt_files is empty). Indexing the
        # result would raise TypeError, so report the situation instead.
        return "\nNo documents are available to search.\n"

    # First element of the match is the selected file name
    selected_file = best_match[0]
    results = retriever[selected_file].get_relevant_documents(query)

    header = "\n\nBelow are the related document's content: \n\n"
    retrieved_text = header + "".join(
        result.page_content + "\n" for result in results[:max_chunks]
    )
    return retrieved_text


def get_summarized_text(name : str) -> str:
    """
    Summarize the text most recently stored in the module-level `retrieved_text`.

    The summarization pipeline is created on first use and cached on the
    function object, so later calls reuse it instead of rebuilding it.

    Args:
    - name (str): Ignored; present only because the function is called with a
      single string argument when used as a tool.

    Returns:
    - str: The 'summary_text' of the first summarizer result, or a short
      notice when no text has been retrieved yet.
    """
    global retrieved_text
    article = retrieved_text
    if not article or not article.strip():
        # Nothing to summarize: point the caller at the retrieval tool
        return "\nNo document text has been retrieved yet. Use the 'Get Relevant document' tool first.\n"

    summarizer = getattr(get_summarized_text, "_summarizer", None)
    if summarizer is None:
        # Imported lazily so transformers is only needed once a summary is requested
        from transformers import pipeline
        summarizer = pipeline("summarization", model="Falconsai/text_summarization")
        get_summarized_text._summarizer = summarizer
    return summarizer(article, max_length=1000, min_length=30, do_sample=False)[0]['summary_text']


# Wrap each helper function as a LangChain Tool. Every Tool receives a `name`,
# the callable to run (`func`) and a `description`.
# NOTE(review): presumably the name/description strings are what the agent
# prompt shows the model when it chooses a tool — confirm against the prompt.

# Age lookup in the demo database built inside get_age_info
get_age_info_tool = Tool(
    name="Get Age",
    func=get_age_info,
    description="Useful for getting age information for any person. Input should be the name of the person."
)

# Current date; get_today_date ignores its input argument
get_today_date_tool = Tool(
    name="Get Todays Date",
    func=get_today_date,
    description="Useful for getting today's date"
)

# Retrieval from the per-file vector stores; also fills `retrieved_text`
get_relevant_document_tool = Tool(
    name="Get Relevant document",
    func=get_relevant_document,
    description="Useful for getting relevant document that we need."
)

# Summarizes whatever get_relevant_document last stored in `retrieved_text`
get_summarized_text_tool = Tool(
    name="Get Summarized Text",
    func=get_summarized_text,
    description="Useful for getting summarized text for any document."
)

# Common Agent prompts

In [None]:
# Pull the "hwchase17/react" prompt through the LangChain hub client and print
# its template text for inspection.
prompt_react = hub.pull("hwchase17/react")
print(prompt_react.template) 

# Running the ReAct agent

In [None]:
# Module-level buffer shared by the tools: get_relevant_document stores its
# output here and get_summarized_text reads it back.
retrieved_text = ""
# Tools handed to both the agent and its executor
tools = [get_relevant_document_tool, get_summarized_text_tool, get_today_date_tool, get_age_info_tool]

# Chat model served through Groq, authenticated with GROQ_API_KEY
model = ChatGroq(model_name="llama3-70b-8192", groq_api_key=GROQ_API_KEY, temperature=0)
# Alternative model kept for reference:
# model = OpenAI(openai_api_key="<YOUR_OPENAI>")

# Build the ReAct agent from the model, the tools and the pulled prompt, then
# wrap it in an executor created with verbose=True and handle_parsing_errors=True.
react_agent = create_react_agent(model, tools=tools, prompt=prompt_react)
react_agent_executor = AgentExecutor(
    agent=react_agent, tools=tools, verbose=True, handle_parsing_errors=True
)

In [None]:
# `query` is deliberately a module-level name: get_relevant_document reads it
# when retrieving chunks, so it must be assigned before the agent is invoked.
# NOTE(review): the file listing printed earlier shows only CNPq/Relatório
# documents — confirm the indexed corpus covers what this question asks about.
query = "Give me the summary for the question : What age requirement is specified for using the OpenAI Services, and what provision applies if the user is under 18?"
react_agent_executor.invoke({"input": query})

In [None]:
# Second example question. Reassigning the module-level `query` also changes
# what get_relevant_document retrieves for.
query = "Give me summary of What resources does Google offer to users for assistance and guidance in using its services?"
react_agent_executor.invoke({"input": query})

In [None]:
# Third example question; unlike the two before it, it does not ask for a summary.
query = "What are my rights to my data on Facebook?"
react_agent_executor.invoke({"input": query})

In [None]:
# Fourth example question. It names "Bob", who exists in get_age_info's demo
# database, and refers to a future year — presumably meant to exercise the age
# and date tools together with document retrieval.
query = "I am Bob. Will i be eligible in 2027 for the age requirement specified for using the OpenAI Services by OpenAI Terms?"
react_agent_executor.invoke({"input": query})

In [None]:
import gradio as gr
from io import StringIO
import sys
import re

def generate_response(question):
    """
    Run the ReAct agent on a question and return its captured console trace.

    A fresh ChatGroq model, agent and executor are built on every call. The
    executor is created with verbose=True; everything written to stdout while
    it runs is captured, stripped of ANSI colour codes and returned.

    Side effect: the module-level `query` is set to `question`, because
    get_relevant_document retrieves chunks for that global.

    Args:
    - question (str): The question input by the user.

    Returns:
    - str: The text printed while the agent ran, without ANSI escape codes.
    """
    from contextlib import redirect_stdout

    # Keep the retrieval tool in sync with the question being answered;
    # otherwise it would search for whatever an earlier cell last assigned.
    global query
    query = question

    tools = [get_relevant_document_tool, get_summarized_text_tool, get_today_date_tool, get_age_info_tool]

    model = ChatGroq(model_name="llama3-70b-8192", groq_api_key=GROQ_API_KEY, temperature=0)
    # model = OpenAI(openai_api_key="<YOUR_OPENAI>")

    react_agent = create_react_agent(model, tools=tools, prompt=prompt_react)
    react_agent_executor = AgentExecutor(
        agent=react_agent, tools=tools, verbose=True, handle_parsing_errors=True
    )

    # redirect_stdout puts back whichever stream was active before the call
    # (in a notebook that is the kernel's stream, not sys.__stdout__) and does
    # so even when the agent raises.
    captured = StringIO()
    with redirect_stdout(captured):
        react_agent_executor.invoke({"input": question})
    text_output_str = captured.getvalue()

    # Remove ANSI escape codes
    return re.sub(r'\x1b\[[0-9;]*m', '', text_output_str)

# Set up the Gradio interface: one text box in, one text box out, both wired
# to generate_response.
# NOTE(review): "Intellegent" in the title below is misspelled ("Intelligent").
iface = gr.Interface(
    fn=generate_response,
    inputs=[gr.Textbox(label="Question")],  # Pass input as a list
    outputs=[gr.Textbox(label="Generated Response")],  # Pass output as a list
    title="Intellegent RAG with Qdrant, LangChain ReAct and Llama3 from Groq Endpoint",
    description="Enter a question and get a generated response based on the retrieved text.",
)

# Launch the interface
iface.launch()
