Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions src/server/utils/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Copyright (c) 2024, 2025, Oracle and/or its affiliates.
Licensed under the Universal Permissive License v1.0 as shown at http://oss.oracle.com/licenses/upl.
"""
# spell-checker:ignore langchain, docstore, docos, vectorstores, oraclevs
# spell-checker:ignore langchain, docstore, docos, vectorstores, oraclevs, genai, hnsw

import json
import copy
Expand All @@ -22,7 +22,7 @@
from langchain_core.language_models.chat_models import BaseChatModel
from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import HTMLSectionSplitter, CharacterTextSplitter
from langchain_text_splitters import HTMLHeaderTextSplitter, CharacterTextSplitter

import server.utils.databases as databases

Expand Down Expand Up @@ -130,20 +130,19 @@ def split_document(
("h4", "Header 4"),
("h5", "Header 5"),
]
html_splitter = HTMLSectionSplitter(headers_to_split_on=headers_to_split_on)
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
##################################
# Splitters - End
##################################
match extension.lower():
case "pdf":
doc_split = text_splitter.split_documents(document)
case "html":
try:
html_split = html_splitter.split_documents(document)
except Exception as ex:
logger.exception(ex)
html_split = document
doc_split = text_splitter.split_documents(html_split)
tmp_meta = document[0].metadata
doc_split = html_splitter.split_text(document[0].page_content)
# Copy the original document's metadata (including its source) onto every split chunk
for doc in doc_split:
doc.metadata.update(tmp_meta)
case "pdf" | "md" | "txt" | "csv":
doc_split = text_splitter.split_documents(document)
case _:
Expand Down Expand Up @@ -180,7 +179,8 @@ def load_and_split_documents(
case "pdf":
loader = document_loaders.PyPDFLoader(file)
case "html":
loader = document_loaders.UnstructuredHTMLLoader(file)
# Use TextLoader to preserve the raw HTML markup so the header splitter can split on header tags
loader = document_loaders.TextLoader(file)
case "md":
loader = document_loaders.TextLoader(file)
case "csv":
Expand Down