In [41]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader
import os

def load_documents_from_directory(directory_path):
    documents = []
    for root, _, files in os.walk(directory_path):
        for file in files:
            file_path = os.path.join(root, file)
            if file.endswith('.pdf'):
                loader = PyPDFLoader(file_path)
            elif file.endswith('.docx'):
                loader = Docx2txtLoader(file_path)
            else:
                continue  # Skip unsupported file formats
            documents.extend(loader.load())
    return documents

docs = load_documents_from_directory("contracts/trust-probate")

In [42]:
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=300)
documents = text_splitter.split_documents(docs)
db = Chroma.from_documents(persist_directory='./trust-probate',documents=documents,embedding=OpenAIEmbeddings(model="text-embedding-3-large"))

In [43]:
doc_sources = [doc.metadata['source']  for doc in docs]
doc_sources

['contracts/trust-probate/Assignment of Personal Property to Trust.docx',
 'contracts/trust-probate/Auto Trust.docx',
 'contracts/trust-probate/Affidavit of No Estate Tax Due.docx',
 'contracts/trust-probate/Ratification of Trust Agreement.docx']

In [44]:
query = "I want to start a non-profit"
docs = db.similarity_search(query,k=1)

In [45]:
docs[0].metadata['source']


'contracts/trust-probate/Auto Trust.docx'