In [1]:
# RAG Document QA System

## Setup and Imports

import os
import re
import time
from functools import lru_cache
from typing import List, Dict

import groq
import PyPDF2
import textract
from docx import Document
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

In [2]:
# Set your API keys.
# The keys are read from the environment so that secrets are never saved inside
# the notebook: export GROQ_API_KEY and PINECONE_API_KEY before starting the kernel.
# The empty-string fallback keeps the previous default when a variable is unset.
groq_api_key = os.environ.get("GROQ_API_KEY", "")
pinecone_api_key = os.environ.get("PINECONE_API_KEY", "")


In [3]:
# Initialize Groq client
# (generate_answer sends its chat-completion requests through this client)
groq_client = groq.Groq(api_key=groq_api_key)
# Initialize Pinecone
pc = Pinecone(api_key=pinecone_api_key)
# Name of the Pinecone index that the index-setup cell creates (if missing) and opens
index_name = "rag-doc"

In [4]:
# Sentence-embedding model used by get_embedding. The Pinecone index must be
# created with a dimension that matches the vectors this model produces.
model = SentenceTransformer('all-MiniLM-L6-v2') 

  return torch._C._cuda_getDeviceCount() > 0


In [9]:
# Pinecone index setup
cloud = 'aws'
region = 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)

# Take the vector size from the loaded embedding model instead of hard-coding it,
# so swapping the model cannot silently create an index with the wrong dimension.
# Falls back to 384 (the previously hard-coded value) if the model does not report one.
embedding_dimension = model.get_sentence_embedding_dimension() or 384

# Create Pinecone index if it doesn't exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=embedding_dimension,
        metric="cosine",
        spec=spec
    )
    # Index creation is asynchronous: poll once per second until it reports ready.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# Handle used by index_data (upsert) and retrieve_relevant_chunks (query).
index = pc.Index(index_name)

print("Initialization complete!")

Initialization complete!


In [10]:
## Functions for Text Extraction and Processing

def extract_text_from_pdf(file_path: str) -> str:
    """
    Extracts text from a PDF file.

    Input:
    - file_path (str): The path to the PDF file.

    Output:
    - str: The extracted text from the PDF. Every page's text is followed by a
      newline character, so an N-page document yields N newline-terminated parts.

    This function opens the PDF in binary mode, reads all its pages with
    PyPDF2.PdfReader, and joins the per-page text. A page for which the reader
    returns no text (None or an empty string) contributes only its newline
    instead of raising a TypeError.
    """
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open; the reader pulls
        # page data from the underlying handle.
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf_reader.pages]
    # A single join avoids rebuilding the growing string once per page.
    return "".join(page_texts)

def extract_text_from_docx(file_path: str) -> str:
    """
    Reads the paragraph text out of a Word (.docx) document.

    Input:
    - file_path (str): Location of the Word document on disk.

    Output:
    - str: The text of every paragraph, in document order, separated by
      single newline characters.

    Only the entries of the document's `paragraphs` collection are read.
    """
    word_doc = Document(file_path)
    paragraph_texts = []
    for para in word_doc.paragraphs:
        paragraph_texts.append(para.text)
    return "\n".join(paragraph_texts)

def extract_text_from_file(file_path: str) -> str:
    """
    Returns the text content of a file, choosing the extractor by extension.

    Input:
    - file_path (str): Location of the file on disk.

    Output:
    - str: The text extracted from the file.

    The extension is compared case-insensitively:
    - .pdf        -> extract_text_from_pdf
    - .docx       -> extract_text_from_docx
    - .txt / .md  -> read directly as UTF-8 text
    - anything else (including no extension) -> textract, with its byte
      output decoded as UTF-8
    """
    _, extension = os.path.splitext(file_path)
    extension = extension.lower()

    if extension == '.pdf':
        return extract_text_from_pdf(file_path)

    if extension == '.docx':
        return extract_text_from_docx(file_path)

    if extension in ('.txt', '.md'):
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.read()

    # Fallback for every other format: textract returns bytes.
    raw_bytes = textract.process(file_path)
    return raw_bytes.decode('utf-8')

def load_data(file_path: str) -> List[str]:
    """
    Loads and chunks data from a file.

    Input:
    - file_path (str): The path to the file.

    Output:
    - List[str]: A list of non-empty, whitespace-stripped text chunks, in
      document order.

    This function extracts text from the given file with extract_text_from_file
    and splits it into paragraphs at blank lines. A "blank line" is any run of
    whitespace between two newline characters, so paragraph breaks written with
    Windows line endings ("\\r\\n\\r\\n") or containing stray spaces/tabs are
    recognised as well as a plain "\\n\\n".
    """
    text = extract_text_from_file(file_path)
    # Splitting on the literal '\n\n' would miss CRLF and whitespace-only
    # separator lines and glue those paragraphs into one oversized chunk.
    paragraphs = re.split(r'\n\s*\n', text)
    stripped = (paragraph.strip() for paragraph in paragraphs)
    return [paragraph for paragraph in stripped if paragraph]

@lru_cache(maxsize=1000)
def get_embedding(text: str) -> List[float]:
    """
    Computes the embedding vector of a piece of text.

    Input:
    - text (str): The text to embed.

    Output:
    - List[float]: The result of the model's `encode` call converted with
      `tolist()`.

    Up to 1000 distinct inputs are memoised with lru_cache. A cache hit hands
    back the very same list object that was stored, so callers should treat the
    returned list as read-only.
    """
    vector = model.encode(text)
    return vector.tolist()

def index_data(chunks: List[str], batch_size: int = 100) -> None:
    """
    Indexes the text chunks in Pinecone.

    Input:
    - chunks (List[str]): A list of text chunks to be indexed.
    - batch_size (int): Maximum number of vectors sent per upsert request
      (default: 100). Must be at least 1.

    Output:
    - None

    Raises:
    - ValueError: If batch_size is smaller than 1.

    This function creates an embedding for each text chunk and upserts the
    vectors into the Pinecone index together with their metadata (the original
    text). Vectors are sent in batches rather than in one request so large
    documents do not produce a single oversized upsert, and nothing is sent at
    all when `chunks` is empty.

    Each vector's ID is the chunk's position in `chunks` ("0", "1", ...), so
    indexing another document into the same index reuses those IDs.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    for start in range(0, len(chunks), batch_size):
        batch = [
            (str(start + offset), get_embedding(chunk), {"text": chunk})
            for offset, chunk in enumerate(chunks[start:start + batch_size])
        ]
        index.upsert(vectors=batch)

def retrieve_relevant_chunks(query: str, top_k: int) -> List[Dict]:
    """
    Retrieves relevant chunks for a given query.

    Input:
    - query (str): The input query.
    - top_k (int): The number of top results to retrieve.

    Output:
    - List[Dict]: The `matches` entry of the Pinecone query response; each
      match carries the metadata stored at indexing time.

    This function creates an embedding for the query and uses it to find the
    most similar chunks in the Pinecone index.

    The query results are deliberately NOT memoised: a cache keyed only on
    (query, top_k) keeps returning the old matches after the index has been
    re-populated with a different document. The query embedding itself is still
    obtained through get_embedding, so repeated questions skip re-encoding.
    """
    query_embedding = get_embedding(query)
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
    return results['matches']

@lru_cache(maxsize=100)
def generate_answer(
    query: str,
    context: str,
    model_name: str = "mixtral-8x7b-32768",
    max_tokens: int = 150,
    temperature: float = 0.7,
) -> str:
    """
    Generates an answer for a given query and context.

    Input:
    - query (str): The input question.
    - context (str): The context information to base the answer on.
    - model_name (str): Groq model identifier (default: "mixtral-8x7b-32768").
    - max_tokens (int): Upper bound on generated tokens (default: 150).
    - temperature (float): Sampling temperature (default: 0.7).

    Output:
    - str: The generated answer with surrounding whitespace removed, or an
      empty string if the response carries no message content.

    This function creates a prompt using the query and context, sends it to the
    Groq language model, and returns the generated answer. Results are cached
    per distinct argument combination, so a repeated call returns the earlier
    answer rather than a fresh sample.
    """
    prompt = f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"

    # NOTE(review): confirm the default model id is still served by Groq; pass
    # model_name to switch models without editing this function.
    response = groq_client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers questions based on the given context."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature,
    )

    # The message content may be None; guard so .strip() cannot raise.
    content = response.choices[0].message.content
    return (content or "").strip()


In [11]:
## Document Processing and Indexing

# Specify the path to your document
# (the file extension decides which extractor extract_text_from_file uses)
file_path = "test.pdf"  # Change this to your document path

# Process and index the document:
# load_data splits the extracted text into chunks, index_data embeds each chunk
# and upserts it into the Pinecone index.
data_chunks = load_data(file_path)
index_data(data_chunks)

print("Document processed and indexed successfully!")

Document processed and indexed successfully!


In [13]:
def ask_question(question: str, top_k: int = 3) -> None:
    """
    Answers a question from the indexed document and prints the result.

    Input:
    - question (str): The question to ask.
    - top_k (int): How many of the best-matching chunks to retrieve (default: 3).

    Output:
    - None. The question, the generated answer, and every retrieved chunk
      (with its similarity score to four decimals) are printed to stdout.

    The retrieved chunk texts are joined with newlines and passed as the
    context to generate_answer.
    """
    matches = retrieve_relevant_chunks(question, top_k)
    chunk_texts = [match['metadata']['text'] for match in matches]
    answer = generate_answer(question, "\n".join(chunk_texts))

    print(f"Question: {question}")
    print(f"Answer: {answer}")
    print("\nRetrieved Chunks:")
    for i, (chunk, chunk_text) in enumerate(zip(matches, chunk_texts)):
        print(f"\nChunk {i+1} (Score: {chunk['score']:.4f}):")
        print(chunk_text)

# Example usage
# (uses the default top_k=3; requires a document to have been indexed first)
ask_question("What is the main topic of this document?")

Question: What is the main topic of this document?
Answer: The main topic of this document is the development of an interactive interface for a Question Answering (QA) bot, which allows users to upload documents, ask questions based on the document content, and retrieve real-time answers. The system should handle multiple queries efficiently, provide accurate and contextually relevant responses, and display the retrieved document segments alongside the generated answers. The QA bot is built using a Retrieval-Augmented Generation (RAG) model, a vector database (such as Pinecone DB), and a generative model (like Cohere API) to handle questions related to a provided document or dataset.

Retrieved Chunks:

Chunk 1 (Score: 0.2055):
Part
2:
Interactive
QA
Bot
Interface
Problem
Statement:
Develop
an
interactive
interface
for
the
QA
bot
from
Part
1,
allowing
users
to
input
queries
and
retrieve
answers
in
real
time.
The
interface
should
enable
users
to
upload
documents
and
ask
questions
based
