In [1]:
import logging
import os
import re
from pathlib import Path
from typing import Iterator

from backend.parser import rustore_docs_extractor

from bs4 import BeautifulSoup, SoupStrainer
from langchain_community.document_loaders import SitemapLoader
from langchain.indexes import SQLRecordManager, index
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.embeddings import Embeddings
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

In [2]:
# Emit INFO-level records so the scraping progress logged below is visible in cell output.
logging.basicConfig(level=logging.INFO)
# Notebook-wide logger; used by the Chromium-backed loader defined further down.
logger = logging.getLogger(__name__)

In [9]:
import nest_asyncio

# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel owns a running loop
# while the sitemap loader drives its own coroutines — confirm before removing.
nest_asyncio.apply()

In [15]:
class SitemapLoaderWithChromium(SitemapLoader):
    """Sitemap loader that fetches every page through headless Chromium (Playwright).

    Compared with the parent class this override:

    * renders pages in a real browser (``_fetch``), and
    * calls ``parsing_function(page, url)`` and
      ``meta_function(entry, page, text_content)`` — i.e. both callbacks receive one
      more argument than a single scraped page.
    """

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per sitemap entry that has a ``loc`` URL.

        The sitemap is read from ``self.web_path`` — from disk when
        ``self.is_local`` is true, otherwise via ``self._scrape``.

        Yields:
            Document: ``page_content`` is the result of ``parsing_function`` and
            ``metadata`` is the result of ``meta_function`` for that entry.
        """
        if self.is_local:
            # BeautifulSoup consumes the file inside its constructor, so the handle
            # can be released as soon as parsing is done.
            with open(self.web_path) as fp:
                soup = BeautifulSoup(fp, "xml")
        else:
            soup = self._scrape(self.web_path, parser="xml")

        # Drop entries without a URL *before* scraping so entries and scraped pages
        # stay aligned one-to-one (indexing the unfiltered list by result position
        # would pair pages with the wrong entry or raise KeyError).
        entries = [el for el in self.parse_sitemap(soup) if "loc" in el]
        pages = self.scrape_all([el["loc"].strip() for el in entries])

        for entry, page in zip(entries, pages):
            # The callbacks receive the raw (unstripped) "loc" value.
            text_content = self.parsing_function(page, entry["loc"])
            yield Document(
                page_content=text_content,
                metadata=self.meta_function(entry, page, text_content),
            )

    async def _fetch(
            self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
    ) -> str:
        """
        Asynchronously scrape the content of a given URL using Playwright's async API.

        A fresh headless Chromium instance is launched for every call and is always
        closed again, including when navigation fails or the task is cancelled.

        Args:
            url (str): The URL to scrape.
            retries (int): Accepted for signature compatibility; not used here.
            cooldown (int): Accepted for signature compatibility; not used here.
            backoff (float): Accepted for signature compatibility; not used here.

        Returns:
            str: The scraped HTML content or an error message if an exception occurs.

        """
        # Imported lazily so the class can be defined without Playwright installed.
        from playwright.async_api import async_playwright

        logger.info("Starting scraping...")
        results = ""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                await page.goto(url)
                results = await page.content()  # Simply get the HTML content
                logger.info("Content scraped")
            except Exception as e:
                # Best-effort scraping: keep going, but leave a trace of the failure
                # instead of only hiding it inside the returned text.
                logger.warning("Failed to scrape %s: %s", url, e)
                results = f"Error: {e}"
            finally:
                await browser.close()
        return results


def metadata_extractor(meta: dict, soup: BeautifulSoup, text_content: str) -> dict:
    """Assemble the metadata dict for one scraped page.

    Args:
        meta: Sitemap entry for the page; must contain ``"loc"``. All of its keys
            are copied into the result last, so they win over the computed ones.
        soup: Parsed page, queried for ``<title>``, ``<meta name="description">``
            and ``<html>``.
        text_content: Extracted page text; its first line becomes ``"crumbs"`` and
            doubles as the title when the page has no ``<title>``.

    Returns:
        dict with ``crumbs``, ``source``, ``title``, ``description`` and
        ``language``, followed by every key of ``meta``.
    """
    title_tag = soup.find("title")
    first_line, _, _ = text_content.partition("\n")
    description_tag = soup.find("meta", attrs={"name": "description"})
    html_tag = soup.find("html")

    extracted = {"crumbs": first_line, "source": meta["loc"]}

    if title_tag:
        extracted["title"] = title_tag.get_text()
    else:
        extracted["title"] = first_line

    if description_tag:
        extracted["description"] = description_tag.get("content", "")
    else:
        extracted["description"] = ""

    if html_tag:
        extracted["language"] = html_tag.get("lang", "")
    else:
        extracted["language"] = ""

    # Sitemap fields are merged last and therefore override same-named keys above.
    extracted.update(meta)
    return extracted

In [16]:
def load_rustore_docs(
    sitemap_path: str = "../data/sitemap-help.xml",
    filter_urls: tuple = ("https://www.rustore.ru/help/sdk/payments/react",),
) -> "list[Document]":
    """Load RuStore help pages listed in a local sitemap file.

    Args:
        sitemap_path: Path (``str`` or ``Path``) to the sitemap XML file. A relative
            path is resolved against the current working directory.
        filter_urls: URL filters handed to the loader; only sitemap entries
            accepted by the loader for these filters are fetched.

    Returns:
        The documents produced by ``SitemapLoaderWithChromium.load()``, with text
        extracted by ``rustore_docs_extractor`` and metadata built by
        ``metadata_extractor``.
    """
    file_path = Path(sitemap_path).absolute()
    loader = SitemapLoaderWithChromium(
        file_path,
        is_local=True,
        filter_urls=list(filter_urls),
        parsing_function=rustore_docs_extractor,
        default_parser="lxml",
        bs_kwargs={
            # NOTE(review): "lang" and "content" look like attribute names rather
            # than tag names — confirm this strainer selects what is intended.
            "parse_only": SoupStrainer(
                name=("article", "title", "html", "lang", "content")
            ),
        },
        meta_function=metadata_extractor,
        requests_per_second=1,
    )
    return loader.load()

In [17]:
# Fetch and parse the selected help pages (performs live page loads, see the log output).
docs = load_rustore_docs()

INFO:langchain_community.document_loaders.web_base:fake_useragent not found, using default user agent.To get a realistic header for requests, `pip install fake_useragent`.
Fetching pages:   0%|          | 0/4 [00:00<?, ?it/s]INFO:__main__:Starting scraping...
INFO:__main__:Content scraped
INFO:__main__:Starting scraping...
Fetching pages:  25%|##5       | 1/4 [00:01<00:03,  1.32s/it]INFO:__main__:Content scraped
INFO:__main__:Starting scraping...
Fetching pages:  50%|#####     | 2/4 [00:02<00:02,  1.37s/it]INFO:__main__:Content scraped
INFO:__main__:Starting scraping...
Fetching pages:  75%|#######5  | 3/4 [00:04<00:01,  1.49s/it]INFO:__main__:Content scraped
Fetching pages: 100%|##########| 4/4 [00:05<00:00,  1.41s/it]


In [48]:
def split_docs_by_markdown(_docs: list[Document]) -> list[Document]:
    """Split each document into one chunk per ``##`` markdown section.

    Every chunk inherits the metadata of the page it came from. When a chunk's
    ``header`` metadata contains an anchor token such as ``#some-anchor``, the
    chunk's ``source`` becomes ``"<page source>/<anchor>"`` so that it points at the
    section; chunks without a header, or whose header has no anchor token, keep
    the page-level ``source``.

    Args:
        _docs: Documents whose ``page_content`` is markdown text.

    Returns:
        A flat list with the chunks of all input documents, in input order.
    """
    from langchain_text_splitters import MarkdownHeaderTextSplitter

    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("##", "header")],
        strip_headers=False,
    )
    # First "#word-like" token in the header text, e.g. "[#setup] Setup" -> "#setup".
    anchor_pattern = re.compile(r"#[\w-]+")

    docs_to_return = []
    for doc in _docs:
        for chunk in markdown_splitter.split_text(doc.page_content):
            # Page metadata is merged over the splitter's own keys (e.g. "header").
            chunk.metadata |= doc.metadata

            header = chunk.metadata.get("header")
            anchor = anchor_pattern.search(header) if header else None
            # A header without an anchor token used to raise IndexError here;
            # such chunks now simply keep the page-level source.
            if anchor:
                chunk.metadata["source"] = f'{doc.metadata["source"]}/{anchor.group()}'

            docs_to_return.append(chunk)

    return docs_to_return

In [49]:
# Break every loaded page into per-section chunks; `docs` itself is left bound as-is.
transformed_docs = split_docs_by_markdown(docs)

In [51]:
# Inspect the metadata of each chunk (note the section anchors appended to "source").
for doc in transformed_docs:
    print(doc.metadata)

{'crumbs': 'Документация SDK | Платежи in-app и подписки | React Native', 'source': 'https://www.rustore.ru/help/sdk/payments/react', 'title': 'Документация RuStore', 'description': '', 'language': 'ru', 'loc': 'https://www.rustore.ru/help/sdk/payments/react', 'changefreq': 'weekly', 'priority': '0.5'}
{'crumbs': 'Документация SDK | Платежи in-app и подписки | React Native | [версия] 3.1.0', 'source': 'https://www.rustore.ru/help/sdk/payments/react/3-1-0', 'title': 'Документация RuStore', 'description': '', 'language': 'ru', 'loc': 'https://www.rustore.ru/help/sdk/payments/react/3-1-0', 'changefreq': 'weekly', 'priority': '0.5'}
{'header': '[#пример-реализации] Пример реализации', 'crumbs': 'Документация SDK | Платежи in-app и подписки | React Native | [версия] 3.1.0', 'source': 'https://www.rustore.ru/help/sdk/payments/react/3-1-0/#пример-реализации', 'title': 'Документация RuStore', 'description': '', 'language': 'ru', 'loc': 'https://www.rustore.ru/help/sdk/payments/react/3-1-0', 'c