In [3]:
import os

import requests
from langchain.agents import create_agent
from langchain_classic import hub
from langchain_classic.tools.retriever import create_retriever_tool
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.tools import tool
from langchain_core.vectorstores import VectorStoreRetriever
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.checkpoint.memory import MemorySaver

In [4]:
# Module-level embedding model: BAAI/bge-m3, with the CUDA device requested
# via model_kwargs and an encode batch size of 8.
embeddings = HuggingFaceEmbeddings(
    model="BAAI/bge-m3",
    model_kwargs={"device": "cuda"},
    encode_kwargs={"batch_size": 8},
)

In [5]:
def create_pdf_retriever(
        pdf_path: str,
        persist_directory: str,
        embedding_model
) -> VectorStoreRetriever:
    """Build a Chroma-backed retriever over the semantic chunks of a PDF.

    The PDF is loaded with ``PyMuPDFLoader``, the text of all loaded
    documents is joined into a single string, split with
    ``SemanticChunker``, and the resulting chunks are indexed into a Chroma
    vector store created with ``persist_directory``.

    Args:
        pdf_path: Path of the PDF file to load.
        persist_directory: Directory passed to Chroma as its persistence
            location.
        embedding_model: Embedding object used both by the semantic chunker
            and by the vector store.

    Returns:
        The retriever obtained from ``vectorstore.as_retriever()``.

    Raises:
        ValueError: If no text could be extracted from the PDF.
    """
    pages = PyMuPDFLoader(pdf_path).load()

    # Chunk over the whole document rather than page by page so that
    # semantic breakpoints are not forced at page boundaries.
    full_text = '\n\n'.join(page.page_content for page in pages)
    if not full_text.strip():
        # Fail early with a clear message instead of a downstream error from
        # the splitter or from indexing an empty document list.
        raise ValueError(f"No extractable text found in PDF: {pdf_path}")

    text_splitter = SemanticChunker(embeddings=embedding_model)
    # `split_documents` expects Document objects; for raw strings the
    # splitter API is `create_documents`, which also attaches the given
    # metadata to every chunk produced from the corresponding text.
    chunks = text_splitter.create_documents(
        [full_text], metadatas=[{"source": pdf_path}]
    )

    # NOTE(review): every call indexes the chunks again; if
    # `persist_directory` already holds a collection this presumably adds
    # duplicates -- confirm and reuse the existing store if that matters.
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_model,
        persist_directory=persist_directory,
    )

    return vectorstore.as_retriever()