In [None]:
import openai
import fitz  # PyMuPDF
import pytesseract
import os
from dotenv import load_dotenv
import chromadb
from chromadb.config import Settings
from tqdm import tqdm
import pandas as pd

In [None]:
# Load environment variables from .env file.
# The os.getenv() lookups in the following cells (OPENAI_API_KEY, INPUT_PATH,
# OUTPUT_PATH) read the values made available here.
load_dotenv()

In [None]:
# Initialize OpenAI API key from the OPENAI_API_KEY environment variable.
# os.getenv returns None when the variable is not set, in which case the key
# stays unset and later OpenAI calls are expected to fail.
openai.api_key = os.getenv('OPENAI_API_KEY')

In [None]:
# Define paths
# INPUT_PATH: directory scanned for PDF files (None when the variable is unset).
input_path = os.getenv('INPUT_PATH')
# OUTPUT_PATH: directory that receives the results workbook. os.path.join is
# used instead of string concatenation so the path is correct with or without
# a trailing separator; an unset variable falls back to the working directory
# instead of raising TypeError on `None + str`.
output_excel_path = os.path.join(os.getenv('OUTPUT_PATH') or '', 'Output.xlsx')

In [None]:
# Initialize ChromaDB client with default Settings().
# NOTE(review): presumably this is an in-memory (non-persistent) client, in
# which case the collection is rebuilt after every kernel restart — confirm
# against the installed ChromaDB version.
client = chromadb.Client(Settings())

In [None]:
# Fetch the collection that stores the page texts, creating it if it does not
# exist yet. No embedding function is passed here.
# NOTE(review): presumably ChromaDB then falls back to its default embedding
# function for both upserts and queries — confirm.
collection_name = "document_embeddings"
collection = client.get_or_create_collection(name=collection_name)

In [None]:
# OCR helper: turn an image into text with Tesseract
def extract_text_from_image(image):
    """Run OCR on ``image`` and return the recognised text.

    Args:
        image: Anything ``pytesseract.image_to_string`` accepts.

    Returns:
        The recognised text, or an empty string when OCR raises. The error is
        printed rather than propagated so one bad image does not stop a batch.
    """
    ocr_text = ""
    try:
        ocr_text = pytesseract.image_to_string(image)
    except Exception as err:
        print(f"Error extracting text from image: {err}")
    return ocr_text

In [None]:
# Helper: make a PyMuPDF pixmap consumable by pytesseract
def _pixmap_to_array(pix):
    """Convert a PyMuPDF pixmap into a uint8 NumPy image array.

    pytesseract rejects ``fitz.Pixmap`` objects ("Unsupported image object")
    but accepts NumPy arrays, so the raw sample buffer is reshaped to
    ``(height, width, channels)``; single-channel pixmaps become 2-D arrays.
    """
    import numpy as np  # local import: only needed on the OCR fallback path

    pixels = np.frombuffer(pix.samples, dtype=np.uint8)
    pixels = pixels.reshape(pix.height, pix.width, pix.n)
    return pixels[:, :, 0] if pix.n == 1 else pixels


# Function to read and extract content from documents
def read_documents(input_path):
    """Extract per-page text from every PDF located directly in ``input_path``.

    Pages whose embedded text layer is empty are rendered to an image and
    passed through OCR via ``extract_text_from_image``.

    Args:
        input_path: Directory containing the PDF files.

    Returns:
        dict mapping each PDF file name to a list of ``(page_number, text)``
        tuples with 1-based page numbers. Files that cannot be read are
        reported and skipped; an unreadable directory yields an empty dict.
    """
    documents = {}
    try:
        # Sorted so the processing order does not depend on the file system.
        file_names = sorted(os.listdir(input_path))
    except Exception as e:
        print(f"Error reading documents: {e}")
        return documents

    for file_name in file_names:
        # Case-insensitive so "REPORT.PDF" is picked up as well.
        if not file_name.lower().endswith('.pdf'):
            continue
        try:
            file_path = os.path.join(input_path, file_name)
            # The context manager closes the document even if a page fails.
            with fitz.open(file_path) as document:
                document_text = []
                for page_number in range(len(document)):
                    page = document.load_page(page_number)
                    text = page.get_text()
                    # If the text extraction fails or is empty, use OCR
                    if not text.strip():
                        pix = page.get_pixmap()
                        text = extract_text_from_image(_pixmap_to_array(pix))
                    document_text.append((page_number + 1, text))
            documents[file_name] = document_text
        except Exception as e:
            # A single broken PDF must not prevent the remaining files
            # from being processed.
            print(f"Error reading document {file_name}: {e}")
    return documents

In [None]:
# Function to store page texts in ChromaDB, which embeds them on upsert
def create_and_store_embeddings(documents):
    """Upsert every non-empty page of ``documents`` into the ChromaDB collection.

    Only ``documents``, ``ids`` and ``metadatas`` are handed to
    ``collection.upsert``; no precomputed vectors are supplied, so the
    embeddings come from the collection itself. ``find_answer`` queries the
    same collection through ``query_texts``, so stored and query embeddings
    are produced the same way.

    NOTE: the per-page ``openai.Embedding.create`` call that used to live here
    was removed because its response was never used: it only added latency
    and cost, and any API failure aborted the whole upsert.

    Args:
        documents: dict mapping file name -> list of ``(page_number, text)``.

    Returns:
        None. Errors are printed instead of raised.
    """
    ids = []
    texts = []
    metadata = []
    try:
        for file_name, pages in tqdm(documents.items(), desc="Reading Documents"):
            for page_number, text in pages:
                # Blank pages carry nothing to retrieve; skip them.
                if not text or not text.strip():
                    continue
                ids.append(f"{file_name}_{page_number}")
                texts.append(text)
                metadata.append({"file_name": file_name, "page_number": page_number})

        # Nothing to store: avoid sending an empty batch to ChromaDB.
        if not ids:
            return

        # Upsert embeddings into ChromaDB
        collection.upsert(
            documents=texts,
            ids=ids,
            metadatas=metadata
        )
    except Exception as e:
        print(f"Error creating and storing embeddings: {e}")

In [None]:
# Function to find the most relevant answer based on query
def find_answer(query):
    """Return the id and metadata of the stored page closest to ``query``.

    Args:
        query: Free-text question to search the collection with.

    Returns:
        ``(best_match_id, best_match_metadata)`` for the closest page, or
        ``(None, None)`` when the collection returns no match or the lookup
        fails (the error is printed).
    """
    try:
        results = collection.query(
            query_texts=[query],
            n_results=1  # Number of closest matches to return; adjust if needed
        )

        # Results hold one inner list per query text. The inner list is empty
        # when nothing matched, so it has to be checked as well: the outer
        # list alone is non-empty even for `[[]]`.
        matched_ids = results.get('ids') or []
        matched_metadatas = results.get('metadatas') or []
        if matched_ids and matched_ids[0]:
            best_match_id = matched_ids[0][0]  # Get the ID of the best match
            best_match_metadata = None
            if matched_metadatas and matched_metadatas[0]:
                best_match_metadata = matched_metadatas[0][0]  # Get the metadata of the best match
            return best_match_id, best_match_metadata
    except Exception as e:
        print(f"Error finding answer: {e}")
    return None, None

In [None]:
# Pull the source location (file, page) out of a match's metadata
def get_context(best_match_metadata):
    """Return ``(file_name, page_number)`` read from ``best_match_metadata``.

    Args:
        best_match_metadata: Mapping with ``'file_name'`` and
            ``'page_number'`` keys, or a falsy value when there is no match.

    Returns:
        A 2-tuple of the two values (each ``None`` if its key is absent), or
        ``(None, None)`` when the metadata is falsy or reading it fails.
    """
    if not best_match_metadata:
        return None, None

    try:
        lookup = best_match_metadata.get
        return lookup('file_name'), lookup('page_number')
    except Exception as err:
        print(f"Error getting context: {err}")
        return None, None

In [None]:
# Ask the OpenAI chat model to phrase an answer from the matched page
def get_presentable_answer(query, context, documents):
    """Generate an answer to ``query`` from the page identified by ``context``.

    Args:
        query: The user's question.
        context: ``(file_name, page_number)`` of the page to answer from.
        documents: dict mapping file name -> list of ``(page_number, text)``.

    Returns:
        The model's reply with surrounding whitespace stripped, or
        ``"No context found."`` when the page is not present in ``documents``
        or anything raises (the error is printed).
    """
    not_found = object()
    try:
        target_file, target_page = context
        # Lazily walk the documents and stop at the first page that matches.
        matching_texts = (
            page_text
            for doc_name, doc_pages in documents.items()
            if doc_name == target_file
            for page_no, page_text in doc_pages
            if page_no == target_page
        )
        page_text = next(matching_texts, not_found)
        if page_text is not not_found:
            user_prompt = f"Answer the following query based on the context provided:\n\nQuery: {query}\n\nContext: {page_text}"
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": user_prompt},
                ],
                max_tokens=512,
            )
            reply = completion['choices'][0]['message']['content']
            return reply.strip()
    except Exception as err:
        print(f"Error getting presentable answer: {err}")
    return "No context found."


In [None]:
# Function to run the DocumentQA
def document_qa(query, documents):
    """Answer ``query`` from the best-matching page of ``documents``.

    Pipeline: ``find_answer`` (vector search) -> ``get_context`` (file/page
    lookup) -> ``get_presentable_answer`` (LLM phrasing).

    Args:
        query: The user's question.
        documents: dict mapping file name -> list of ``(page_number, text)``.

    Returns:
        dict with keys ``"answer"``, ``"file_name"`` and ``"page_number"``.
        When no usable match exists or a step raises, the answer is
        ``"No relevant answer found."`` and both location fields are ``None``.
    """
    try:
        # Only the metadata is needed here; the matched id is not used.
        _, best_match_metadata = find_answer(query)

        if best_match_metadata:
            file_name, page_number = get_context(best_match_metadata)
            # get_context always returns a 2-tuple (possibly of Nones), which
            # is truthy as a whole, so the individual fields must be checked
            # to know whether a real location was resolved.
            if file_name is not None and page_number is not None:
                presentable_answer = get_presentable_answer(
                    query, (file_name, page_number), documents
                )
                return {
                    "answer": presentable_answer,
                    "file_name": file_name,
                    "page_number": page_number
                }
    except Exception as e:
        print(f"Error in document_qa: {e}")
    return {
        "answer": "No relevant answer found.",
        "file_name": None,
        "page_number": None
    }

In [None]:
# Function to save results to Excel file
def save_results_to_excel(results, output_excel_path):
    """Write ``results`` to the workbook, appending below rows already stored.

    Args:
        results: Iterable of ``[question, answer, pdf_name, page_number]`` rows.
        output_excel_path: Path of the ``.xlsx`` workbook. It is created with a
            header row when it does not exist yet.

    An existing workbook is opened in append mode with
    ``if_sheet_exists='overlay'``. Overlay writes at ``startrow`` (row 0 by
    default), so the first free row is computed explicitly; otherwise each
    save would overwrite the header and the earliest answers.
    """
    sheet_name = "Sheet1"  # default sheet name used by DataFrame.to_excel
    df = pd.DataFrame(results, columns=["Question", "Answer", "PDF Name", "Page no"])

    if not os.path.exists(output_excel_path):
        df.to_excel(output_excel_path, index=False)  # Create file with header
        return

    with pd.ExcelWriter(output_excel_path, mode='a', if_sheet_exists='overlay') as writer:
        sheet = writer.sheets.get(sheet_name)
        # openpyxl reports max_row == 1 for a sheet without any cells, so an
        # untouched sheet is detected by also checking its only cell.
        sheet_is_empty = sheet is None or (
            sheet.max_row == 1
            and sheet.max_column == 1
            and sheet.cell(row=1, column=1).value is None
        )
        if sheet_is_empty:
            df.to_excel(writer, sheet_name=sheet_name, index=False)
        else:
            # startrow is 0-based while max_row is 1-based, so max_row is the
            # index of the first unused row.
            df.to_excel(
                writer,
                sheet_name=sheet_name,
                index=False,
                header=False,
                startrow=sheet.max_row,
            )


In [None]:
# Main function to run the script
def main(input_path):
    """Index the PDFs in ``input_path`` and run the interactive question loop.

    Each answered question is printed and recorded. The recorded rows are
    written to ``output_excel_path`` when the user types ``exit``, and also
    when the session ends through end-of-input, an interrupt or an unexpected
    error, so collected answers are not lost.

    Args:
        input_path: Directory containing the PDF files to index.
    """
    results = []
    session_started = False
    try:
        # Read and process documents
        documents = read_documents(input_path)

        # Create and store embeddings
        create_and_store_embeddings(documents)

        session_started = True
        while True:
            try:
                query = input("\nEnter your query (type 'exit' to quit): ").strip()
            except (EOFError, KeyboardInterrupt):
                # Closed stdin or an interrupted prompt counts as "exit".
                break
            if query.lower() == 'exit':
                break
            if not query:
                # Nothing typed: ask again instead of querying with "".
                continue

            result = document_qa(query, documents)
            results.append([query, result['answer'], result['file_name'], result['page_number']])

            print("\n" + "="*40)
            print(f"Question: {query}")
            print(f"Answer: {result['answer']}")
            print(f"File: {result['file_name']}")
            print(f"Page: {result['page_number']}")
            print("="*40)
    except Exception as e:
        print(f"Error in main: {e}")
    finally:
        # Save once the prompt loop has been reached, however it ended.
        if session_started:
            print("\nSaving answers...")
            try:
                save_results_to_excel(results, output_excel_path)
                print(f"Output created at {output_excel_path}")
            except Exception as e:
                print(f"Error in main: {e}")

In [None]:
# Execute the main function.
# Runs only when this code is executed as the main module, and uses the
# module-level input_path read from the INPUT_PATH environment variable.
if __name__ == "__main__":
    main(input_path)
