# In [None]:
import os
import gc
import re
from openai import OpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings


class PDFProcessor:
    """Index PDFs into per-document Chroma stores and answer questions on them.

    Each PDF is split into chunks, embedded with a SentenceTransformer model
    and stored under ``<vector_db_path>/<pdf file stem>``. Questions are
    answered by retrieving the closest chunks and sending them, together with
    the question, to a chat model served through OpenRouter.

    Note: the store directory is derived from the PDF's file name only, so two
    PDFs with the same name in different folders map to the same store.
    """

    EMBEDDING_MODEL = "all-MiniLM-L6-v2"
    OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
    CHAT_MODEL = "mistralai/mixtral-8x7b-instruct"
    # Environment variables consulted for the API key, in priority order.
    API_KEY_ENV_VARS = ("OPENROUTER_API_KEY", "OPENAI_API_KEY")
    # Marker file written inside a store once indexing finished successfully.
    FLAG_FILENAME = "processed.flag"

    def __init__(self, api_key=None, vector_db_path="pdf_vector_db"):
        """Set up the embedding model, the storage root and the chat client.

        Args:
            api_key: OpenRouter API key. When omitted, the key is read from
                the environment (see ``API_KEY_ENV_VARS``).
            vector_db_path: Directory under which one store per PDF is kept.
                Created if it does not exist.

        Raises:
            RuntimeError: If no API key was passed and none is configured in
                the environment.
        """
        # Resolve the key first so a missing credential fails fast, before
        # the embedding model is loaded.
        resolved_key = self._resolve_api_key(api_key)

        self.embeddings = SentenceTransformerEmbeddings(
            model_name=self.EMBEDDING_MODEL
        )
        self.vector_db_path = vector_db_path
        os.makedirs(self.vector_db_path, exist_ok=True)

        # SECURITY: credentials must never be committed to source. Any key
        # that was ever hard-coded in this file should be revoked and replaced.
        # The key and endpoint are handed to the client directly instead of
        # being written into os.environ, so other OpenAI clients in the same
        # process are not silently redirected to OpenRouter.
        self.client = OpenAI(
            api_key=resolved_key,
            base_url=self.OPENROUTER_BASE_URL,
        )

    @staticmethod
    def _resolve_api_key(api_key=None):
        """Return the explicit key, else the first one set in the environment."""
        if api_key:
            return api_key
        for env_name in PDFProcessor.API_KEY_ENV_VARS:
            env_value = os.environ.get(env_name)
            if env_value:
                return env_value
        raise RuntimeError(
            "No API key configured: pass api_key=... or set one of "
            + ", ".join(PDFProcessor.API_KEY_ENV_VARS)
        )

    def process_pdf(self, pdf_path):
        """Build the vector store for ``pdf_path`` unless it already exists.

        Args:
            pdf_path: Location of the PDF, passed as-is to ``PyPDFLoader``.

        Returns:
            The store directory on success (or when the PDF was already
            processed), or ``None`` if indexing failed or produced no chunks.
            Failures are reported on stdout rather than raised.
        """
        db_name = os.path.splitext(os.path.basename(pdf_path))[0]
        vector_path = os.path.join(self.vector_db_path, db_name)
        flag_path = os.path.join(vector_path, self.FLAG_FILENAME)

        # The flag is only written after a complete run, so its presence
        # means the store can be reused as-is.
        if os.path.exists(flag_path):
            print(f"[SKIP] Already processed: {pdf_path}")
            return vector_path

        print(f"[PROCESSING] Creating vector DB for: {pdf_path}")
        # Pre-bind so the cleanup in ``finally`` is valid no matter how far
        # the ``try`` body got.
        documents = chunks = vectordb = None
        try:
            documents = PyPDFLoader(pdf_path).load()

            text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            chunks = text_splitter.split_documents(documents)
            if not chunks:
                # Nothing to index; do not write the flag, so a corrected
                # file with the same name is picked up on the next run.
                print(f"[ERROR] No extractable text found in: {pdf_path}")
                return None

            vectordb = Chroma.from_documents(
                documents=chunks,
                embedding=self.embeddings,
                persist_directory=vector_path,
            )
            vectordb.persist()

            with open(flag_path, "w", encoding="utf-8") as f:
                f.write("processed")

            print(f"[SUCCESS] Vector DB created at: {vector_path}")
            return vector_path

        except Exception as e:
            # Top-level boundary for indexing: report and signal failure via
            # the return value so callers can simply test for ``None``.
            print(f"[ERROR] {e}")
            return None
        finally:
            # Drop the potentially large intermediates before collecting.
            del documents, chunks, vectordb
            gc.collect()

    def query_document(self, vector_path, question):
        """Answer ``question`` from the store at ``vector_path``.

        The three most similar chunks are retrieved and sent with the
        question to the chat model.

        Args:
            vector_path: Store directory as returned by ``process_pdf``.
            question: The user's question.

        Returns:
            The model's answer, or a short fallback message when no chunk
            matched or the model returned no content.

        Raises:
            ValueError: If ``vector_path`` is empty/``None`` or does not exist.
        """
        # ``process_pdf`` returns None on failure; reject that here with the
        # same ValueError instead of a TypeError from ``os.path.exists``.
        if not vector_path or not os.path.exists(vector_path):
            raise ValueError("Vector DB not found at specified path")

        vectordb = Chroma(
            persist_directory=vector_path,
            embedding_function=self.embeddings,
        )

        # Retrieve top 3 relevant chunks
        results = vectordb.similarity_search(question, k=3)
        if not results:
            return "No relevant information found."

        context = " ".join(r.page_content for r in results)

        prompt = f"""
You are a helpful assistant. Use the following context to answer:

Context:
{context}

Question: {question}
        """

        response = self.client.chat.completions.create(
            model=self.CHAT_MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=250,
        )

        # Guard against an empty choice list or a message without content so
        # callers always receive a string.
        choices = response.choices
        if not choices or choices[0].message.content is None:
            return "No answer was generated."
        return choices[0].message.content


if __name__ == "__main__":
    # Interactive demo: index one PDF, then answer questions about it until
    # the user types 'quit'.
    processor = PDFProcessor()

    pdf_path = r"C:\Users\User\Downloads\admini_details_new.pdf"
    vector_path = processor.process_pdf(pdf_path)

    # process_pdf returns None on failure, in which case there is nothing
    # to query.
    if vector_path:
        while True:
            try:
                question = input("\nAsk a question (or 'quit' to exit): ").strip()
            except (EOFError, KeyboardInterrupt):
                # Treat end-of-input / Ctrl-C at the prompt as a request to exit.
                print()
                break

            if question.lower() == "quit":
                break
            if not question:
                # Nothing to ask; prompt again instead of sending an empty query.
                continue

            try:
                answer = processor.query_document(vector_path, question)
            except Exception as e:
                # Keep the session alive when a single query fails.
                print(f"[ERROR] {e}")
                continue
            print(f"\nAnswer:\n{answer}\n")