In [None]:
import nest_asyncio

# Patch asyncio up front, before any cell below drives async code.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and the loaders start their own — confirm it is still required.
nest_asyncio.apply()

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings


def get_embeddings_model(
    model_name: str = "intfloat/multilingual-e5-small",
) -> "HuggingFaceEmbeddings":
    """Build the HuggingFace embeddings wrapper used by this notebook.

    Args:
        model_name: Hugging Face model identifier passed to
            ``HuggingFaceEmbeddings``. Defaults to the model this notebook
            originally hard-coded, so existing zero-argument calls behave
            exactly as before.

    Returns:
        A new ``HuggingFaceEmbeddings`` instance configured for ``model_name``.
    """
    # A fresh instance is created on every call; callers that need to reuse
    # the model should keep a reference to the returned object.
    return HuggingFaceEmbeddings(model_name=model_name)

In [None]:
from langchain_community.vectorstores import Chroma

# Bind `db` to the "test_collection" collection stored under ../chroma_data,
# embedding with the model returned by get_embeddings_model().
db = Chroma(
    collection_name="test_collection",
    embedding_function=get_embeddings_model(),
    persist_directory="../chroma_data",
)

In [None]:
# Quick sanity check on the opened store.
# NOTE(review): presumably the number of stored documents — confirm.
len(db)

In [None]:
from urllib.parse import urljoin
from pathlib import Path
from langchain_community.document_loaders import SitemapLoader

# The sitemap is read from a local file next to the notebook; resolve it to an
# absolute path before handing it to the loader.
file_path = Path("./sitemap-help.xml").absolute()

# Load with the stock SitemapLoader.
# NOTE(review): filter_urls presumably restricts loading to sitemap entries
# matching the given pattern (here the Defold payments help page) — confirm.
docs = SitemapLoader(
    file_path,
    filter_urls=["https://www.rustore.ru/help/sdk/payments/defold"],
    is_local=True,
    continue_on_failure=True,
    default_parser="lxml",
).load()

In [None]:
import logging
from langchain_community.document_loaders import SitemapLoader

logger = logging.getLogger(__name__)


class SitemapLoaderWithChromium(SitemapLoader):
    """Sitemap loader that fetches each page through headless Chromium.

    Only ``_fetch`` is overridden; everything else is inherited from
    ``SitemapLoader``. Pages are loaded with Playwright, so the HTML returned
    is whatever ``page.content()`` reports after navigation.
    """

    async def _fetch(
        self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
    ) -> str:
        """
        Asynchronously scrape the content of a given URL using Playwright's async API.

        Args:
            url (str): The URL to scrape.
            retries (int): Maximum number of attempts before giving up.
            cooldown (int): Base delay, in seconds, between attempts.
            backoff (float): Multiplier applied to the delay per attempt; the
                wait after attempt ``i`` (0-based) is ``cooldown * backoff**i``.

        Returns:
            str: The scraped HTML content.

        Raises:
            Exception: Whatever the last failed attempt raised. The error is
                propagated instead of being returned as page text, so an
                ``"Error: ..."`` string is never handed on as scraped HTML.
                NOTE(review): the base loader is expected to apply
                ``continue_on_failure`` around this call — confirm against the
                installed langchain_community version.
            ValueError: If ``retries`` is less than 1, so no attempt is made.

        """
        # Local imports: asyncio is only needed for the retry delay, and
        # Playwright is imported here rather than at the top of the cell.
        import asyncio

        from playwright.async_api import async_playwright

        logger.info("Starting scraping...")
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                for attempt in range(retries):
                    page = await browser.new_page()
                    try:
                        await page.goto(url)
                        html = await page.content()  # Simply get the HTML content
                        logger.info("Content scraped")
                        return html
                    except Exception as e:
                        if attempt == retries - 1:
                            # Out of attempts: surface the failure to the caller.
                            raise
                        logger.warning(
                            "Error fetching %s with attempt %d/%d: %s. Retrying...",
                            url,
                            attempt + 1,
                            retries,
                            e,
                        )
                        await asyncio.sleep(cooldown * backoff**attempt)
                    finally:
                        # Close the page every attempt so retries do not pile up tabs.
                        await page.close()
            finally:
                # Always release the browser, including on cancellation.
                await browser.close()
        # Only reachable when the loop body never ran (retries < 1).
        raise ValueError("retry count exceeded")

In [None]:
from pathlib import Path

# Same local sitemap file as before, resolved to an absolute path.
file_path = Path("./sitemap-help.xml").absolute()

# Repeat the load with the Chromium-backed loader. continue_on_failure is
# False here, unlike the stock-loader cell where it is True.
docs = SitemapLoaderWithChromium(
    file_path,
    filter_urls=["https://www.rustore.ru/help/sdk/payments/defold"],
    is_local=True,
    continue_on_failure=False,
    default_parser="lxml",
).load()

In [None]:
# Display the documents produced by the most recent `.load()` call.
docs

In [1]:
from langchain_community.chat_models import Replicate

In [None]:
# NOTE(review): constructed with no arguments — the Replicate wrapper
# presumably needs at least a model identifier (and an API token in the
# environment); confirm before relying on this cell.
# NOTE(review): the import above pulls Replicate from `chat_models`; verify it
# resolves in the installed langchain_community (it may live under `llms`).
Replicate()