Merged
2 changes: 1 addition & 1 deletion .gitignore
@@ -82,4 +82,4 @@ spring_ai/target/**
spring_ai/create_user.sql
spring_ai/drop.sql
src/client/spring_ai/target/classes/*
api_server_key
api_server_key
2 changes: 2 additions & 0 deletions src/server/api/utils/embed.py
@@ -176,6 +176,8 @@ def load_and_split_documents(
case "png" | "jpg" | "jpeg":
loader = UnstructuredImageLoader(file)
split = False
case "txt":
loader = document_loaders.TextLoader(file)
case _:
raise ValueError(f"{extension} is not a supported file extension")

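For context, a minimal sketch of what the new `txt` branch resolves to, assuming `document_loaders` here is LangChain's `langchain_community.document_loaders` (the file name `notes.txt` is illustrative, not from the PR):

```python
# Hedged illustration of the new "txt" case, assuming langchain_community's
# document_loaders module; the file name is a placeholder.
from langchain_community import document_loaders

loader = document_loaders.TextLoader("notes.txt")
docs = loader.load()  # list of Document objects with page_content and metadata
print(len(docs), docs[0].page_content[:80])
```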
139 changes: 139 additions & 0 deletions src/server/api/utils/webscrape.py
@@ -0,0 +1,139 @@
import re
import unicodedata
from typing import Dict, List

import aiohttp
from bs4 import BeautifulSoup, Comment

BAD_CHUNKS = [
    "nav", "header", "footer", "aside", "form", "menu", "breadcrumb", "toc", "pagination",
    "subscribe", "advert", "ads", "promo", "social", "share", "comment", "related", "widget",
    "modal", "banner", "cookie", "newsletter", "disclaimer",
]

def normalize_ws(s: str) -> str:
    s = unicodedata.normalize("NFKC", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()

def clean_soup(soup: BeautifulSoup) -> None:
    for el in soup(["script", "style", "noscript", "template", "svg", "canvas", "iframe"]):
        el.decompose()
    for c in soup.find_all(string=lambda t: isinstance(t, Comment)):
        c.extract()
    for tag in soup.find_all(True):
        ident = " ".join([
            (tag.get("id") or ""),
            " ".join(tag.get("class") or []),
            (tag.get("role") or ""),
        ]).lower()
        if any(b in ident for b in BAD_CHUNKS):
            tag.decompose()

def heading_level(tag) -> int:
    return int(tag.name[1])

def group_by_sections(soup):
    sections = []
    for section in soup.find_all(["section", "article"]):
        # Use the first heading, if present, for the section title
        heading = section.find(re.compile("^h[1-6]$"))
        title = normalize_ws(heading.get_text()) if heading else ""
        paragraphs = []
        for p in section.find_all("p"):
            txt = normalize_ws(p.get_text())
            if txt:
                paragraphs.append(txt)
        if paragraphs:
            # Paragraphs in the section are joined with blank lines; adjust as needed
            sections.append({
                "title": title,
                "level": heading_level(heading) if heading else 2,
                "content": "\n\n".join(paragraphs),
            })
    return sections

def table_to_markdown(table):
    # Simple HTML table to Markdown converter; the first row is treated as the header
    rows = []
    for tr in table.find_all("tr"):
        cols = [td.get_text(strip=True) for td in tr.find_all(["td", "th"])]
        rows.append(cols)
    # Make Markdown
    md = ""
    if rows:
        md += "| " + " | ".join(rows[0]) + " |\n"
        md += "| " + " | ".join("---" for _ in rows[0]) + " |\n"
        for row in rows[1:]:
            md += "| " + " | ".join(row) + " |\n"
    return md

def group_by_headings(soup):
    grouped = []
    # Find all headings
    for hdr in soup.find_all(re.compile("^h[1-6]$")):
        title = normalize_ws(hdr.get_text())
        buffer = []
        # Collect next siblings until another heading of this or higher level
        for sib in hdr.find_next_siblings():
            if sib.name and re.match(r"^h[1-6]$", sib.name, re.I):
                if int(sib.name[1]) <= int(hdr.name[1]):
                    break
            if sib.name == "p":
                text = normalize_ws(sib.get_text())
                if text:
                    buffer.append(text)
            elif sib.name in ("ul", "ol"):
                for li in sib.find_all("li"):
                    text = normalize_ws(li.get_text())
                    if text:
                        buffer.append("• " + text)
        if buffer:
            grouped.append({
                "title": title,
                "level": heading_level(hdr),
                "content": "\n\n".join(buffer),
            })
    return grouped

def sections_to_markdown(sections: List[Dict]) -> str:
    lines: List[str] = []
    for s in sections:
        hashes = "#" * max(1, min(6, s.get("level", 2)))
        lines.append(f"{hashes} {s['title']}")
        lines.append(s["content"])
        lines.append("")
    out = "\n".join(lines).strip()
    return out + "\n" if out else out

def slugify(text: str, max_len: int = 80) -> str:
    text = unicodedata.normalize("NFKD", text)
    text = re.sub(r"[^\w\s-]", "", text).strip().lower()
    text = re.sub(r"[\s_-]+", "-", text)
    return text[:max_len] or "page"

async def fetch_and_extract_paragraphs(url):
    paragraphs = []
    async with aiohttp.ClientSession() as session:
        async with session.get(str(url)) as response:
            html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    for script in soup(["script", "style"]):
        script.decompose()
    for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
        element.extract()

    for p in soup.find_all("p"):
        txt = normalize_ws(p.get_text())
        if txt:
            paragraphs.append(txt)
    return paragraphs

async def fetch_and_extract_sections(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(str(url)) as response:
            html = await response.text()

    soup = BeautifulSoup(html, "html.parser")
    for script in soup(["script", "style"]):
        script.decompose()
    for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
        element.extract()

    # Prefer grouping by section; fall back to headings
    chunks = group_by_sections(soup)
    if not chunks:
        chunks = group_by_headings(soup)
    return chunks
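A minimal usage sketch for these helpers, assuming the module is importable as `server.api.utils.webscrape`; the URL is illustrative:

```python
import asyncio

import server.api.utils.webscrape as webscrape

async def main():
    url = "https://example.com/docs/getting-started"  # illustrative URL
    sections = await webscrape.fetch_and_extract_sections(url)
    for idx, sec in enumerate(sections, 1):
        # One slugified filename per extracted section, mirroring how the API uses the module
        print(f"{idx:02d}-{webscrape.slugify(sec['title'])}.txt", len(sec["content"]), "chars")

asyncio.run(main())
```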
56 changes: 35 additions & 21 deletions src/server/api/v1/embed.py
@@ -12,12 +12,13 @@
from fastapi import APIRouter, HTTPException, Response, Header, UploadFile
from fastapi.responses import JSONResponse
from pydantic import HttpUrl
import requests
import aiohttp

import server.api.utils.oci as utils_oci
import server.api.utils.databases as utils_databases
import server.api.utils.embed as utils_embed
import server.api.utils.models as utils_models
import server.api.utils.webscrape as web_parse

from common import functions, schema, logging_config

@@ -76,26 +76,39 @@ async def store_web_file(
logger.debug("Received store_web_file - request: %s", request)
temp_directory = utils_embed.get_temp_directory(client, "embedding")

    # Save the file temporarily
    for url in request:
        filename = Path(urlparse(str(url)).path).name
        request_timeout = 60
        logger.debug("Requesting: %s (timeout in %is)", url, request_timeout)
        response = requests.get(url, timeout=request_timeout)
        content_type = response.headers.get("Content-Type", "").lower()

        if "application/pdf" in content_type or "application/octet-stream" in content_type:
            with open(temp_directory / filename, "wb") as file:
                file.write(response.content)
        elif "text" in content_type or "html" in content_type:
            with open(temp_directory / filename, "w", encoding="utf-8") as file:
                file.write(response.text)
        else:
            shutil.rmtree(temp_directory)
            raise HTTPException(
                status_code=500,
                detail=f"Unprocessable content type: {content_type}.",
            )
    async with aiohttp.ClientSession() as session:
        for url in request:
            filename = Path(urlparse(str(url)).path).name
            request_timeout = aiohttp.ClientTimeout(total=60)
            logger.debug("Requesting: %s (timeout in %ss)", url, request_timeout.total)
            async with session.get(str(url), timeout=request_timeout) as response:
                content_type = response.headers.get("Content-Type", "").lower()

                if "application/pdf" in content_type or "application/octet-stream" in content_type:
                    with open(temp_directory / filename, "wb") as file:
                        file.write(await response.read())

                elif "text" in content_type or "html" in content_type:
                    sections = await web_parse.fetch_and_extract_sections(url)
                    base = web_parse.slugify(str(url).split("/")[-1]) or "page"
                    out_files = []
                    for idx, sec in enumerate(sections, 1):
                        # Filename includes the section number and an optional slugified title for clarity
                        title_slug = web_parse.slugify(sec["title"]) if sec.get("title") else ""
                        sec_filename = f"{base}-section{idx}-{title_slug}.txt" if title_slug else f"{base}-section{idx}.txt"
                        sec_path = temp_directory / sec_filename
                        with open(sec_path, "w", encoding="utf-8", errors="replace") as f:
                            if sec.get("title"):
                                f.write(sec["title"].strip() + "\n\n")
                            f.write(str(sec["content"]).strip())
                        out_files.append(sec_filename)

                else:
                    shutil.rmtree(temp_directory)
                    raise HTTPException(
                        status_code=500,
                        detail=f"Unprocessable content type: {content_type}.",
                    )

    stored_files = [f.name for f in temp_directory.iterdir() if f.is_file()]
    return Response(content=json.dumps(stored_files), media_type="application/json")
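For illustration, a compact sketch of the aiohttp download pattern the rewritten loop relies on; the URL and destination path are assumptions for the example, not part of the endpoint:

```python
import asyncio
from pathlib import Path

import aiohttp

async def download(url: str, dest: Path) -> None:
    timeout = aiohttp.ClientTimeout(total=60)
    async with aiohttp.ClientSession() as session:
        async with session.get(url, timeout=timeout) as response:
            content_type = response.headers.get("Content-Type", "").lower()
            if "application/pdf" in content_type or "application/octet-stream" in content_type:
                dest.write_bytes(await response.read())   # binary payloads
            else:
                dest.write_text(await response.text(), encoding="utf-8")  # textual payloads

asyncio.run(download("https://example.com/sample.pdf", Path("/tmp/sample.pdf")))
```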