In [None]:
import nest_asyncio

# Patch asyncio so an event loop can be re-entered while the notebook
# kernel's own loop is already running.
# NOTE(review): presumably needed by the sitemap loaders below, which drive
# asyncio themselves — confirm before removing this cell.
nest_asyncio.apply()

In [1]:
from langchain_huggingface import HuggingFaceEmbeddings


def get_embeddings_model(model_name: str = "intfloat/multilingual-e5-small"):
    """Build a ``HuggingFaceEmbeddings`` instance.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceEmbeddings``.
            Defaults to the model this notebook previously hard-coded, so
            existing zero-argument calls behave exactly as before.

    Returns:
        A new ``HuggingFaceEmbeddings`` object configured for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [2]:
from langchain_community.vectorstores import Chroma

# Chroma vector store handle: collection "test_collection" stored under
# ../chroma_data, embedded with the model returned by get_embeddings_model().
db = Chroma(
    embedding_function=get_embeddings_model(),
    collection_name="test_collection",
    persist_directory="../chroma_data",
)

  from tqdm.autonotebook import tqdm, trange


In [9]:
# Show the vector store handle as the cell result.
# NOTE(review): the saved output for this cell is a dict starting with an
# 'ids' key, which looks like the result of a `.get()` call rather than the
# repr of `db` itself — the output is probably stale; re-run to refresh.
db

{'ids': ['01293c9f-665c-59f6-959b-d75003bcaef4', '05316004-126b-55b4-b81f-56ea6fec7275', '1b0be22f-c6ba-5643-b6ea-a7f2a4c42eae', '1f97ff16-8a33-5c05-b8fd-f243a4811b2d', '20c632cb-569a-53e1-a649-5f8d856085e2', '289849e6-de28-564f-8447-845e0f271f32', '2b009261-0a4a-5d65-a992-a3e1e85092d8', '2de8aad1-2b69-5ea8-936a-dd952d107e85', '304f2d62-2848-5411-8e50-1bcebe7f2d38', '32888bc6-78f2-5680-98da-4ecf36c0782e', '34838b82-3c64-5eeb-9e47-2c8e37324303', '37d21564-dc99-53ed-b601-81f623b7cfae', '418ff7c0-01d2-5d71-93f7-cdd9dfc704a8', '45010149-f1a1-585d-8c07-1ef0c377e044', '4c128a6b-8a34-5c75-8e6a-2a6458d826dd', '52b6a059-39cf-596e-a82f-e805cd4706cc', '58d516f9-7668-51bb-bd16-d15c90a1bc53', '69f0408d-497e-56e4-850d-71138599f3e7', '74340055-9681-52c8-ab93-ecd73f79dfbe', '77eef81b-f647-5f1a-be05-2cd965f66f35', '7c72d887-eeb6-504f-aae5-fc1862d061ce', '89f2e1ee-f963-523f-981f-0301caeffa28', '8e7e8743-7905-5815-b93e-7b5b307c6c95', '9438cd3f-0408-5533-800b-c9bc3563eaea', 'a5595e25-2304-53eb-b451-7041ed

In [None]:
from urllib.parse import urljoin
from pathlib import Path
from langchain_community.document_loaders import SitemapLoader

# Absolute location of the sitemap file that sits next to the notebook's
# working directory.
file_path = Path("./sitemap-help.xml").absolute()

# Read the local sitemap, keep only URLs matching the Defold payments SDK
# filter, and load the matching pages as documents.
docs = SitemapLoader(
    web_path=file_path,
    filter_urls=["https://www.rustore.ru/help/sdk/payments/defold"],
    is_local=True,
    continue_on_failure=True,
    default_parser="lxml",
).load()

In [None]:
import logging
from langchain_community.document_loaders import SitemapLoader

logger = logging.getLogger(__name__)

class SitemapLoaderWithChromium(SitemapLoader):
    """``SitemapLoader`` variant that fetches pages with headless Chromium.

    Only ``_fetch`` is overridden: instead of a plain HTTP request, each URL
    is opened in a Playwright-driven Chromium page and the page's HTML is
    returned.
    """

    async def _fetch(
        self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
    ) -> str:
        """
        Asynchronously scrape the content of a given URL using Playwright's async API.

        Args:
            url (str): The URL to scrape.
            retries (int): Maximum number of attempts before giving up.
            cooldown (int): Base delay in seconds between attempts.
            backoff (float): Multiplier applied to the delay after each
                failed attempt (delay is ``cooldown * backoff**attempt``).

        Returns:
            str: The HTML content of the page.

        Raises:
            Exception: Whatever Playwright raised on the final failed attempt.
                The error is propagated rather than returned as a string, so
                error text is never mistaken for page HTML.
                NOTE(review): the base loader is expected to apply its
                ``continue_on_failure`` policy around ``_fetch`` — confirm
                against the installed langchain_community version.
            ValueError: If ``retries`` is not positive, so no attempt was made.
        """
        import asyncio

        from playwright.async_api import async_playwright

        logger.info("Starting scraping...")
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                for attempt in range(retries):
                    page = await browser.new_page()
                    try:
                        await page.goto(url)
                        html = await page.content()  # Simply get the HTML content
                    except Exception as e:
                        if attempt == retries - 1:
                            # Out of attempts: surface the real error.
                            raise
                        logger.warning(
                            f"Error fetching {url} with attempt "
                            f"{attempt + 1}/{retries}: {e}. Retrying..."
                        )
                    else:
                        logger.info("Content scraped")
                        return html
                    finally:
                        # Release the page on success and on failure alike.
                        await page.close()
                    # Only reached after a failed, non-final attempt.
                    await asyncio.sleep(cooldown * backoff**attempt)
            finally:
                # Always shut the browser down, even on error or cancellation.
                await browser.close()
        raise ValueError("retry count exceeded")

In [None]:
from pathlib import Path

# Absolute location of the local sitemap file.
file_path = Path("./sitemap-help.xml").absolute()

# Same sitemap and URL filter as the plain SitemapLoader run, but fetched
# through the Chromium-backed loader and with continue_on_failure disabled.
docs = SitemapLoaderWithChromium(
    web_path=file_path,
    filter_urls=["https://www.rustore.ru/help/sdk/payments/defold"],
    is_local=True,
    continue_on_failure=False,
    default_parser="lxml",
).load()

In [None]:
# Display the documents produced by the most recently executed loader cell.
docs

In [1]:
from langchain_community.chat_models import Replicate

In [None]:
# NOTE(review): instantiated with no arguments — presumably a model
# identifier (and credentials) still has to be supplied for this to work.
# Also confirm that `Replicate` is importable from
# `langchain_community.chat_models`; the LangChain docs appear to expose it
# under `langchain_community.llms` — verify.
Replicate()