diff --git a/.gitignore b/.gitignore
index e1344955..2521fac8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -82,4 +82,4 @@ spring_ai/target/**
 spring_ai/create_user.sql
 spring_ai/drop.sql
 src/client/spring_ai/target/classes/*
-api_server_key
+api_server_key
\ No newline at end of file
diff --git a/src/server/api/utils/embed.py b/src/server/api/utils/embed.py
index be858254..24bd9709 100644
--- a/src/server/api/utils/embed.py
+++ b/src/server/api/utils/embed.py
@@ -176,6 +176,8 @@ def load_and_split_documents(
         case "png" | "jpg" | "jpeg":
             loader = UnstructuredImageLoader(file)
             split = False
+        case "txt":
+            loader = document_loaders.TextLoader(file)
         case _:
             raise ValueError(f"{extension} is not a supported file extension")
 
diff --git a/src/server/api/utils/webscrape.py b/src/server/api/utils/webscrape.py
new file mode 100644
index 00000000..205e7a40
--- /dev/null
+++ b/src/server/api/utils/webscrape.py
@@ -0,0 +1,139 @@
+from bs4 import BeautifulSoup, Comment
+import re, unicodedata
+from typing import List, Dict, Tuple
+import aiohttp
+
+BAD_CHUNKS = [
+    "nav","header","footer","aside","form","menu","breadcrumb","toc","pagination",
+    "subscribe","advert","ads","promo","social","share","comment","related","widget",
+    "modal","banner","cookie","newsletter","disclaimer"
+]
+
+def normalize_ws(s: str) -> str:
+    s = unicodedata.normalize("NFKC", s)
+    s = re.sub(r"\s+", " ", s)
+    return s.strip()
+
+def clean_soup(soup: BeautifulSoup) -> None:
+    for el in soup(["script","style","noscript","template","svg","canvas","iframe"]):
+        el.decompose()
+    for c in soup.find_all(string=lambda t: isinstance(t, Comment)):
+        c.extract()
+    for tag in soup.find_all(True):
+        ident = " ".join([
+            (tag.get("id") or ""),
+            " ".join(tag.get("class") or []),
+            (tag.get("role") or "")
+        ]).lower()
+        if any(b in ident for b in BAD_CHUNKS):
+            tag.decompose()
+
+def heading_level(tag) -> int:
+    return int(tag.name[1])
+
+def group_by_sections(soup):
+    sections = []
+    for section in soup.find_all(['section', 'article']):
+        # Use the first heading if present for section title
+        heading = section.find(re.compile('^h[1-6]$'))
+        title = normalize_ws(heading.get_text()) if heading else ""
+        paragraphs = []
+        for p in section.find_all('p'):
+            txt = normalize_ws(p.get_text())
+            if txt:
+                paragraphs.append(txt)
+        if paragraphs:
+            # All paragraphs in the section are joined with blank lines; change as you prefer
+            sections.append({"title": title, "content": "\n\n".join(paragraphs)})
+    return sections
+
+def table_to_markdown(table):
+    # Simple HTML table to Markdown converter
+    rows = []
+    for tr in table.find_all("tr"):
+        cols = [td.get_text(strip=True) for td in tr.find_all(["td", "th"])]
+        rows.append(cols)
+    # Make Markdown
+    md = ""
+    if rows:
+        md += "| " + " | ".join(rows[0]) + " |\n"
+        md += "| " + " | ".join("---" for _ in rows[0]) + " |\n"
+        for row in rows[1:]:
+            md += "| " + " | ".join(row) + " |\n"
+    return md
+
+def group_by_headings(soup):
+    grouped = []
+    # Find all headings
+    for hdr in soup.find_all(re.compile("^h[1-6]$")):
+        title = normalize_ws(hdr.get_text())
+        buffer = []
+        # Find next siblings until another heading of this or higher level
+        for sib in hdr.find_next_siblings():
+            if sib.name and re.match(r"^h[1-6]$", sib.name, re.I):
+                if int(sib.name[1]) <= int(hdr.name[1]):
+                    break
+            if sib.name == "p":
+                text = normalize_ws(sib.get_text())
+                if text:
+                    buffer.append(text)
+            elif sib.name in ("ul", "ol"):
+                for li in sib.find_all('li'):
+                    text = normalize_ws(li.get_text())
+                    if text:
+                        buffer.append("• " + text)
+        if buffer:
+            grouped.append({"title": title, "content": "\n\n".join(buffer)})
+    return grouped
+
+def sections_to_markdown(sections: List[Dict]) -> str:
+    lines: List[str] = []
+    for s in sections:
+        hashes = "#" * max(1, min(6, s["level"]))
+        lines.append(f"{hashes} {s['title']}")
+        for p in s["paragraphs"]:
+            lines.append(p)
+        lines.append("")
+    out = "\n".join(lines).strip()
+    return out + "\n" if out else out
+
+def slugify(text: str, max_len: int = 80) -> str:
+    text = unicodedata.normalize("NFKD", text)
+    text = re.sub(r"[^\w\s-]", "", text).strip().lower()
+    text = re.sub(r"[\s_-]+", "-", text)
+    return text[:max_len] or "page"
+
+async def fetch_and_extract_paragraphs(url):
+    paragraphs = []
+    async with aiohttp.ClientSession() as session:
+        async with session.get(str(url)) as response:
+            html = await response.text()
+    soup = BeautifulSoup(html, 'html.parser')
+
+    for script in soup(["script", "style"]):
+        script.decompose()
+    for element in soup(string=lambda text: isinstance(text, Comment)):
+        element.extract()
+
+    for p in soup.find_all("p"):
+        txt = normalize_ws(p.get_text())
+        if txt:
+            paragraphs.append(txt)
+    return paragraphs
+
+async def fetch_and_extract_sections(url):
+    async with aiohttp.ClientSession() as session:
+        async with session.get(str(url)) as response:
+            html = await response.text()
+    soup = BeautifulSoup(html, 'html.parser')
+
+    for script in soup(["script", "style"]):
+        script.decompose()
+    for element in soup(string=lambda text: isinstance(text, Comment)):
+        element.extract()
+
+    # Prefer by section, or fall back to headings
+    chunks = group_by_sections(soup)
+    if not chunks:
+        chunks = group_by_headings(soup)
+    return chunks
\ No newline at end of file
diff --git a/src/server/api/v1/embed.py b/src/server/api/v1/embed.py
index 77176356..23d90a23 100644
--- a/src/server/api/v1/embed.py
+++ b/src/server/api/v1/embed.py
@@ -12,12 +12,13 @@ from fastapi import APIRouter, HTTPException, Response, Header, UploadFile
 from fastapi.responses import JSONResponse
 from pydantic import HttpUrl
 
-import requests
+import aiohttp
 
 import server.api.utils.oci as utils_oci
 import server.api.utils.databases as utils_databases
 import server.api.utils.embed as utils_embed
 import server.api.utils.models as utils_models
+import server.api.utils.webscrape as web_parse
 
 from common import functions, schema, logging_config
 
@@ -76,26 +77,39 @@ async def store_web_file(
     logger.debug("Received store_web_file - request: %s", request)
     temp_directory = utils_embed.get_temp_directory(client, "embedding")
 
-    # Save the file temporarily
-    for url in request:
-        filename = Path(urlparse(str(url)).path).name
-        request_timeout = 60
-        logger.debug("Requesting: %s (timeout in %is)", url, request_timeout)
-        response = requests.get(url, timeout=request_timeout)
-        content_type = response.headers.get("Content-Type", "").lower()
-
-        if "application/pdf" in content_type or "application/octet-stream" in content_type:
-            with open(temp_directory / filename, "wb") as file:
-                file.write(response.content)
-        elif "text" in content_type or "html" in content_type:
-            with open(temp_directory / filename, "w", encoding="utf-8") as file:
-                file.write(response.text)
-        else:
-            shutil.rmtree(temp_directory)
-            raise HTTPException(
-                status_code=500,
-                detail=f"Unprocessable content type: {content_type}.",
-            )
+    async with aiohttp.ClientSession() as session:
+        for url in request:
+            filename = Path(urlparse(str(url)).path).name
+            request_timeout = aiohttp.ClientTimeout(total=60)
+            logger.debug("Requesting: %s (timeout in %is)", url, request_timeout.total)
+            async with session.get(str(url), timeout=request_timeout) as response:
+                content_type = response.headers.get("Content-Type", "").lower()
+
+                if "application/pdf" in content_type or "application/octet-stream" in content_type:
+                    with open(temp_directory / filename, "wb") as file:
+                        file.write(await response.read())
+
+                elif "text" in content_type or "html" in content_type:
+                    sections = await web_parse.fetch_and_extract_sections(url)
+                    base = web_parse.slugify(str(url).split('/')[-1]) or "page"
+                    out_files = []
+                    for idx, sec in enumerate(sections, 1):
+                        # filename includes the section number and an optional slugified title for clarity
+                        stub = web_parse.slugify(sec.get("title", ""))
+                        sec_filename = f"{base}-section{idx}-{stub}.txt" if stub else f"{base}-section{idx}.txt"
+                        sec_path = temp_directory / sec_filename
+                        with open(sec_path, "w", encoding="utf-8", errors="replace") as f:
+                            if sec.get("title"):
+                                f.write(sec["title"].strip() + "\n\n")
+                            f.write(str(sec["content"]).strip())
+                        out_files.append(sec_filename)
+
+                else:
+                    shutil.rmtree(temp_directory)
+                    raise HTTPException(
+                        status_code=500,
+                        detail=f"Unprocessable content type: {content_type}.",
+                    )
 
     stored_files = [f.name for f in temp_directory.iterdir() if f.is_file()]
     return Response(content=json.dumps(stored_files), media_type="application/json")
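
For reviewers, a minimal usage sketch of the new webscrape helpers, assuming the module is importable as `server.api.utils.webscrape` as in the patched `v1/embed.py`; the URL and the printed filenames are placeholders, not part of the patch:

```python
import asyncio

import server.api.utils.webscrape as web_parse


async def main() -> None:
    # Placeholder URL for illustration only.
    url = "https://example.com/docs/getting-started"

    # Prefers <section>/<article> grouping and falls back to heading-based
    # grouping, mirroring what store_web_file() does for text/html responses.
    sections = await web_parse.fetch_and_extract_sections(url)

    for idx, sec in enumerate(sections, 1):
        stub = web_parse.slugify(sec.get("title", ""))
        name = f"page-section{idx}-{stub}.txt" if stub else f"page-section{idx}.txt"
        print(f"{name}: {len(sec['content'])} chars")


if __name__ == "__main__":
    asyncio.run(main())
```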