From b75e261dfb8cff8bc2634912236330b082341a29 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Wed, 8 May 2024 16:58:14 +0200 Subject: [PATCH 01/11] Initial lint and cleanup --- .flake8 | 2 - .../handlers/web_handler/__about__.py | 2 +- .../handlers/web_handler/urlcrawl_helpers.py | 83 +------------------ .../handlers/web_handler/web_handler.py | 35 ++------ 4 files changed, 11 insertions(+), 111 deletions(-) diff --git a/.flake8 b/.flake8 index 0171ea7e66d..cf360dcde2f 100644 --- a/.flake8 +++ b/.flake8 @@ -89,11 +89,9 @@ exclude = mindsdb/integrations/handlers/quickbooks_handler/* mindsdb/integrations/handlers/strava_handler/* mindsdb/integrations/handlers/strava_handler/* - mindsdb/integrations/handlers/web_handler/* mindsdb/integrations/handlers/strava_handler/* mindsdb/integrations/handlers/github_handler/* mindsdb/integrations/handlers/vitess_handler/* - mindsdb/integrations/handlers/web_handler/* mindsdb/integrations/handlers/impala_handler/* mindsdb/integrations/handlers/tdengine_handler/* mindsdb/integrations/handlers/huggingface_api_handler/* diff --git a/mindsdb/integrations/handlers/web_handler/__about__.py b/mindsdb/integrations/handlers/web_handler/__about__.py index 9600d54c970..12fdd95dcaf 100644 --- a/mindsdb/integrations/handlers/web_handler/__about__.py +++ b/mindsdb/integrations/handlers/web_handler/__about__.py @@ -6,4 +6,4 @@ __github__ = 'https://github.com/mindsdb/mindsdb' __pypi__ = 'https://pypi.org/project/mindsdb/' __license__ = 'MIT' -__copyright__ = 'Copyright 2022- mindsdb' +__copyright__ = 'Copyright 2022 - MindsDB' diff --git a/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py b/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py index d839711a6ca..0f8a2024a94 100644 --- a/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +++ b/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py @@ -1,6 +1,5 @@ import concurrent.futures import io -import re import traceback from threading import Lock from urllib.parse import urljoin, urlparse @@ -13,22 +12,19 @@ logger = log.getLogger(__name__) + def pdf_to_markdown(response): - # Download the PDF from the given URL file_stream = io.BytesIO(response.content) - # Open the PDF from the in-memory file document = fitz.open(stream=file_stream, filetype="pdf") markdown_text = "" for page_num in range(len(document)): page = document.load_page(page_num) - # Get the blocks of text blocks = page.get_text("blocks") - # Sort the blocks by their vertical position on the page blocks.sort(key=lambda block: (block[1], block[0])) previous_block_bottom = 0 @@ -46,21 +42,15 @@ def pdf_to_markdown(response): markdown_text += "\n" - # Close the document document.close() return markdown_text - -url_list_lock = Lock() - - def is_valid(url): parsed = urlparse(url) return bool(parsed.netloc) and bool(parsed.scheme) -# this bad boy gets all the crawling done in parallel def parallel_get_all_website_links(urls): url_contents = {} @@ -79,18 +69,16 @@ def parallel_get_all_website_links(urls): url_contents[url] = future.result() except Exception as exc: logger.error(f'{url} generated an exception: {exc}') - + return url_contents -# this crawls one individual website def get_all_website_links(url): logger.info("crawling: {url} ...".format(url=url)) urls = set() domain_name = urlparse(url).netloc try: - # Create a session to handle cookies session = requests.Session() # Add headers to mimic a real browser request @@ -98,9 +86,7 @@ def get_all_website_links(url): "User-Agent": "Mozilla/5.0 (Windows NT 10.0; 
Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36" } - # Send GET request response = session.get(url, headers=headers) - # Accept cookies if necessary if "cookie" in response.request.headers: session.cookies.update(response.cookies) @@ -156,10 +142,8 @@ def get_all_website_links(url): def get_readable_text_from_soup(soup): - # Start formatting as Markdown markdown_output = "" - # Iterate through headings and paragraphs for tag in soup.find_all( ["h1", "h2", "h3", "h4", "h5", "h6", "p", "a", "ul", "ol", "li"] ): @@ -183,14 +167,12 @@ def get_readable_text_from_soup(soup): return markdown_output -# this bad girl does the recursive crawling of the websites def get_all_website_links_rec(url, reviewd_urls, limit=None): if limit is not None: if len(reviewd_urls) >= limit: return reviewd_urls if url not in reviewd_urls: - # if something happens getting the website links for this url then log the error try: reviewd_urls[url] = get_all_website_links(url) except Exception as e: @@ -213,6 +195,7 @@ def get_all_website_links_rec(url, reviewd_urls, limit=None): continue # insert immediately to count limit between threads. fill later + url_list_lock = Lock() with url_list_lock: if limit is None or len(reviewd_urls) < limit: reviewd_urls[new_url] = {} @@ -220,7 +203,6 @@ def get_all_website_links_rec(url, reviewd_urls, limit=None): else: break - # if there is something to fetch, go fetch if len(to_rev_url_list) > 0: new_revised_urls = parallel_get_all_website_links(to_rev_url_list) @@ -230,21 +212,9 @@ def get_all_website_links_rec(url, reviewd_urls, limit=None): get_all_website_links_rec(new_url, reviewd_urls, limit) -# this crawls the websites and returns it all as a dataframe, ready to be served def get_all_websites(urls, limit=1, html=False): reviewd_urls = {} - # def fetch_url(url): - # url = url.rstrip('/') - # if urlparse(url).scheme == "": - # # Try HTTPS first - # url = "https://" + url - # reviewd_urls_iter = {} - # get_all_website_links_rec(url, reviewd_urls_iter, limit) - # return reviewd_urls_iter - - # reviewd_urls = fetch_url(urls[0]) - # Define a helper function that will be run in parallel. def fetch_url(url): # Allow URLs to be passed wrapped in quotation marks so they can be used # directly from the SQL editor. @@ -256,7 +226,6 @@ def fetch_url(url): url = "https://" + url get_all_website_links_rec(url, reviewd_urls, limit) - # Use a ThreadPoolExecutor to run the helper function in parallel. 
with concurrent.futures.ThreadPoolExecutor() as executor: future_to_url = {executor.submit(fetch_url, url): url for url in urls} @@ -269,62 +238,18 @@ def fetch_url(url): df = dict_to_dataframe( reviewd_urls, columns_to_ignore=columns_to_ignore, index_name="url" ) - + print('get_all_websites', df) if not df.empty and df[df.error.isna()].empty: - # no real data - rise exception from first row raise Exception(str(df.iloc[0].error)) return df -# this can parse the native query -def parse_urls_limit(input_str): - # Split the input string into 'url', 'limit' or 'html' parts - items = re.split(r",", input_str) - - # Initialize list for urls, limit and html - urls = [] - limit = None - html = False - - for item in items: - item = item.strip() # Remove leading/trailing whitespace - - # Check if item is a 'limit' or 'html' setting - if item.lower().startswith("limit"): - limit_match = re.search(r"\d+", item) - if limit_match: - limit = int(limit_match.group()) # Extract the number - elif item.lower().startswith("html"): - html_match = re.search(r"(true|false)", item, re.I) - if html_match: - html = ( - html_match.group().lower() == "true" - ) # Check if the value is 'true' - else: - urls.append(item) # Add the item to the url list - - return {"urls": urls, "limit": limit, "html": html} - - -# run a query that goes and crawls urls -# format url, url, ..., limit=n -# you can pass one of many urls, limit is optional -def get_df_from_query_str(query_str): - args = parse_urls_limit(query_str) - df = get_all_websites(args["urls"], args["limit"], args["html"]) - return df - - -# this flips a dictionary of dictionaries into a dataframe so we can use it in mindsdb def dict_to_dataframe(dict_of_dicts, columns_to_ignore=None, index_name=None): - # Convert dictionary of dictionaries into DataFrame df = pd.DataFrame.from_dict(dict_of_dicts, orient="index") - # If columns_to_ignore is provided, drop these columns if columns_to_ignore: df = df.drop(columns_to_ignore, axis=1, errors="ignore") - # If index_name is provided, rename the index if index_name: df.index.name = index_name diff --git a/mindsdb/integrations/handlers/web_handler/web_handler.py b/mindsdb/integrations/handlers/web_handler/web_handler.py index 43faef7236a..1cf01b897a7 100644 --- a/mindsdb/integrations/handlers/web_handler/web_handler.py +++ b/mindsdb/integrations/handlers/web_handler/web_handler.py @@ -13,7 +13,7 @@ from mindsdb.utilities.security import is_private_url from mindsdb.utilities.config import Config -from .urlcrawl_helpers import get_df_from_query_str, get_all_websites +from .urlcrawl_helpers import get_all_websites class CrawlerTable(APITable): @@ -25,7 +25,7 @@ def select(self, query: ast.Select) -> pd.DataFrame: for op, arg1, arg2 in conditions: if op == 'or': - raise NotImplementedError(f'OR is not supported') + raise NotImplementedError('OR is not supported') if arg1 == 'url': url = arg2 @@ -38,20 +38,17 @@ def select(self, query: ast.Select) -> pd.DataFrame: else: urls = url else: - raise NotImplementedError( - f'url can be url = "someurl", you can also crawl multiple sites, as follows:' - f' url IN ("url1", "url2", ..)' - ) - + raise NotImplementedError('Invalid URL format. 
Please provide a single URL like url = "example.com" or' + 'multiple URLs using the format url IN ("url1", "url2", ...)') else: pass if len(urls) == 0: raise NotImplementedError( - f'You must specify what url you want to crawl, for example: SELECT * FROM crawl WHERE url IN ("someurl", ..)') + 'You must specify what url you want to crawl, for example: SELECT * FROM crawl WHERE url = "someurl"') if query.limit is None: - raise NotImplementedError(f'You must specify a LIMIT which defines the number of pages to crawl') + raise NotImplementedError('You must specify a LIMIT which defines the number of pages to crawl') limit = query.limit.value if limit < 0: @@ -85,29 +82,9 @@ class WebHandler(APIHandler): """A class for handling crawling content from websites. Attributes: - """ def __init__(self, name=None, **kwargs): super().__init__(name) - - self.api = None - self.is_connected = True crawler = CrawlerTable(self) self._register_table('crawler', crawler) - - def check_connection(self) -> StatusResponse: - - response = StatusResponse(False) - response.success = True - - return response - - def native_query(self, query_string: str = None): - - df = get_df_from_query_str(query_string) - - return Response( - RESPONSE_TYPE.TABLE, - data_frame=df - ) From 65cb6c7fa5f315987ce9b52b71b57a6bc81ed665 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Wed, 8 May 2024 16:58:21 +0200 Subject: [PATCH 02/11] Improve README --- .../handlers/web_handler/README.md | 91 ++++++++++++++----- 1 file changed, 66 insertions(+), 25 deletions(-) diff --git a/mindsdb/integrations/handlers/web_handler/README.md b/mindsdb/integrations/handlers/web_handler/README.md index 60f1e188f82..cf24de406db 100644 --- a/mindsdb/integrations/handlers/web_handler/README.md +++ b/mindsdb/integrations/handlers/web_handler/README.md @@ -1,54 +1,95 @@ -# Build your Web crawler +--- +title: Web Crawler +sidebarTitle: Web Crawler +--- -This integration allows you to query the results of a crawler in SQL: +In this section, we present how to use a web crawler within MindsDB. -- This can be particularly useful for building A/Q systems from data on a website. +A web crawler is an automated script designed to systematically browse and index content on the internet. Within MindsDB, you can utilize a web crawler to efficiently collect data from various websites. -Note that this crawler can crawl every single sub-site from the original. +## Prerequisites -Let's see in action +Before proceeding, ensure the following prerequisites are met: + +1. Install MindsDB locally via [Docker](/setup/self-hosted/docker) or [Docker Desktop](/setup/self-hosted/docker-desktop). +2. To use Web Crawler with MindsDB, install the required dependencies following [this instruction](/setup/self-hosted/docker#install-dependencies). + +## Connection + +This handler does not require any connection parameters. + +Here is how to initialize a web crawler: ```sql --- Should be able to create a web crawler database CREATE DATABASE my_web -With - ENGINE = 'web'; +WITH ENGINE = 'web'; ``` + +The above query creates a database called `my_web`. This database by default have a table called `crawler` that we can use to crawl data from a given url/urls. + -This creates a database called my_web. This database ships with a table called crawler that we can use to crawl data given some url/urls. +## Usage + +Specifying a query `LIMIT` is required. To crawl all pages on a site, consider setting the limit to a high value, such as 10,000, which exceeds the expected number of pages. 
Be aware that setting a higher limit may result in longer response times. + -## Searching for web content in SQL +### Get Websites Content -Let's get the content of a docs.mindsdb.com website: +The following usage examples demonstrate how to retrieve content from `docs.mindsdb.com`: ```sql -SELECT - * +SELECT * FROM my_web.crawler -WHERE - url = 'docs.mindsdb.com' +WHERE url = 'docs.mindsdb.com' LIMIT 1; ``` +You can also retrieve content from internal pages. The following query fetches the content from 10 internal pages: + +```sql +SELECT * +FROM my_web.crawler +WHERE url = 'docs.mindsdb.com' +LIMIT 10; +``` + +Another option is to get the content from multiple websites by using the `IN ()` operator: -This should return the contents of docs.mindsdb.com. +```sql +SELECT * +FROM my_web.crawler +WHERE url IN ('docs.mindsdb.com', 'docs.python.org') +LIMIT 1; +``` +### Get PDF Content -Now, let's assume we want to search for the content on multiple websites. +MindsDB accepts [file uploads](/sql/create/file) of `csv`, `xlsx`, `xls`, `sheet`, `json`, and `parquet`. However, you can also configure the web crawler to fetch data from PDF files accessible via URLs. ```sql -SELECT - * +SELECT * FROM my_web.crawler -WHERE - url IN ('docs.mindsdb.com', 'docs.python.org') -LIMIT 30; +WHERE url = '' +LIMIT 1; ``` -This command will crawl two sites and stop when the results count hits 30. The total count of rows in the result will be 30. +## Troubleshooting + + +`Web crawler encounters character encoding issues` + +* **Symptoms**: Extracted text appears garbled or contains strange characters instead of the expected text. +* **Checklist**: + 1. Open a GitHub Issue: If you encounter a bug or a repeatable error with encoding, + report it on the [MindsDB GitHub](https://github.com/mindsdb/mindsdb/issues) repository by opening an issue. + -NOTE: limit is mandatory. If you want to crawl all pages on the site, you can pass a big number in the limit (for example, 10000), more than the expected count of pages on the site. -However, a big limit also increases the time waiting for a response. + +`Web crawler times out while trying to fetch content` +* **Symptoms**: The crawler fails to retrieve data from a website, resulting in timeout errors. +* **Checklist**: + 1. Check the network connection to ensure the target site is reachable. + \ No newline at end of file From 79a75fabea9f141b531f408da1f2ba29e14123cd Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Wed, 8 May 2024 16:58:27 +0200 Subject: [PATCH 03/11] Improve docs --- .../app-integrations/web-crawler.mdx | 41 +++++++++++++++---- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/docs/integrations/app-integrations/web-crawler.mdx b/docs/integrations/app-integrations/web-crawler.mdx index 0b02c181ce6..cf24de406db 100644 --- a/docs/integrations/app-integrations/web-crawler.mdx +++ b/docs/integrations/app-integrations/web-crawler.mdx @@ -5,16 +5,14 @@ sidebarTitle: Web Crawler In this section, we present how to use a web crawler within MindsDB. -A web crawler is a computer program or automated script that browses the internet and navigates through websites, web pages, and web content to gather data. Within the realm of MindsDB, a web crawler can be employed to harvest data, which can be used to train models, -domain specific chatbots or fine-tune LLMs. +A web crawler is an automated script designed to systematically browse and index content on the internet. 
Within MindsDB, you can utilize a web crawler to efficiently collect data from various websites. ## Prerequisites Before proceeding, ensure the following prerequisites are met: 1. Install MindsDB locally via [Docker](/setup/self-hosted/docker) or [Docker Desktop](/setup/self-hosted/docker-desktop). -2. To connect Web Crawler to MindsDB, install the required dependencies following [this instruction](/setup/self-hosted/docker#install-dependencies). -3. Install or ensure access to Web Crawler. +2. To use Web Crawler with MindsDB, install the required dependencies following [this instruction](/setup/self-hosted/docker#install-dependencies). ## Connection @@ -26,12 +24,19 @@ Here is how to initialize a web crawler: CREATE DATABASE my_web WITH ENGINE = 'web'; ``` + +The above query creates a database called `my_web`. This database by default have a table called `crawler` that we can use to crawl data from a given url/urls. + ## Usage + +Specifying a query `LIMIT` is required. To crawl all pages on a site, consider setting the limit to a high value, such as 10,000, which exceeds the expected number of pages. Be aware that setting a higher limit may result in longer response times. + + ### Get Websites Content -Here is how to get the content of `docs.mindsdb.com`: +The following usage examples demonstrate how to retrieve content from `docs.mindsdb.com`: ```sql SELECT * @@ -40,7 +45,7 @@ WHERE url = 'docs.mindsdb.com' LIMIT 1; ``` -You can also get the content of internal pages. Here is how to fetch the content from 10 internal pages: +You can also retrieve content from internal pages. The following query fetches the content from 10 internal pages: ```sql SELECT * @@ -49,7 +54,7 @@ WHERE url = 'docs.mindsdb.com' LIMIT 10; ``` -Another option is to get the content from multiple websites. +Another option is to get the content from multiple websites by using the `IN ()` operator: ```sql SELECT * @@ -60,7 +65,7 @@ LIMIT 1; ### Get PDF Content -MindsDB accepts [file uploads](/sql/create/file) of `csv`, `xlsx`, `xls`, `sheet`, `json`, and `parquet`. However, you can utilize the web crawler to fetch data from `pdf` files. +MindsDB accepts [file uploads](/sql/create/file) of `csv`, `xlsx`, `xls`, `sheet`, `json`, and `parquet`. However, you can also configure the web crawler to fetch data from PDF files accessible via URLs. ```sql SELECT * @@ -69,4 +74,22 @@ WHERE url = '' LIMIT 1; ``` -For example, you can provide a link to a `pdf` file stored in Amazon S3. +## Troubleshooting + + +`Web crawler encounters character encoding issues` + +* **Symptoms**: Extracted text appears garbled or contains strange characters instead of the expected text. +* **Checklist**: + 1. Open a GitHub Issue: If you encounter a bug or a repeatable error with encoding, + report it on the [MindsDB GitHub](https://github.com/mindsdb/mindsdb/issues) repository by opening an issue. + + + + +`Web crawler times out while trying to fetch content` + +* **Symptoms**: The crawler fails to retrieve data from a website, resulting in timeout errors. +* **Checklist**: + 1. Check the network connection to ensure the target site is reachable. 
+ \ No newline at end of file From 0a9fa1c86d75cdbb77c0b31d27b7721135d629fa Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Thu, 9 May 2024 13:20:16 +0200 Subject: [PATCH 04/11] Re write helper functions --- .../handlers/web_handler/urlcrawl_helpers.py | 193 ++++++++++++------ 1 file changed, 128 insertions(+), 65 deletions(-) diff --git a/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py b/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py index 0f8a2024a94..fb67a0b9213 100644 --- a/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +++ b/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py @@ -2,8 +2,9 @@ import io import traceback from threading import Lock -from urllib.parse import urljoin, urlparse +from urllib.parse import urljoin, urlparse, urlunparse +import html2text import fitz # PyMuPDF import pandas as pd import requests @@ -13,13 +14,28 @@ logger = log.getLogger(__name__) -def pdf_to_markdown(response): +def pdf_to_markdown(response, gap_threshold=10): + """ + Convert a PDF document to Markdown text. - file_stream = io.BytesIO(response.content) + Args: + response: the response object containing the PDF data + gap_threshold (int): the vertical gap size that triggers a new line in the output (default 10) - document = fitz.open(stream=file_stream, filetype="pdf") + Returns: + A string containing the converted Markdown text. - markdown_text = "" + Raises: + Exception -- if the PDF data cannot be processed. + """ + + try: + file_stream = io.BytesIO(response.content) + document = fitz.open(stream=file_stream, filetype="pdf") + except Exception as e: + raise Exception("Failed to process PDF data: " + str(e)) + + markdown_lines = [] for page_num in range(len(document)): page = document.load_page(page_num) @@ -34,24 +50,45 @@ def pdf_to_markdown(response): block_text = block[4] # Check if there's a large vertical gap between this block and the previous one - if y0 - previous_block_bottom > 10: - markdown_text += "\n" + if y0 - previous_block_bottom > gap_threshold: + markdown_lines.append("") - markdown_text += block_text + "\n" + markdown_lines.append(block_text) previous_block_bottom = y1 - markdown_text += "\n" + markdown_lines.append("") document.close() - return markdown_text + return "\n".join(markdown_lines) + +def is_valid(url) -> bool: + """ + Check if a URL is valid. + + Args: + url: the URL to check -def is_valid(url): + Returns: + bool: True if the URL is valid, False otherwise. + """ parsed = urlparse(url) return bool(parsed.netloc) and bool(parsed.scheme) -def parallel_get_all_website_links(urls): +def parallel_get_all_website_links(urls) -> dict: + """ + Fetch all website links from a list of URLs. + + Args: + urls (list): a list of URLs to fetch links from + + Returns: + A dictionary mapping each URL to a list of links found on that URL. + + Raises: + Exception: if an error occurs while fetching links from a URL. + """ url_contents = {} if len(urls) <= 10: @@ -69,12 +106,22 @@ def parallel_get_all_website_links(urls): url_contents[url] = future.result() except Exception as exc: logger.error(f'{url} generated an exception: {exc}') + # don't raise the exception, just log it, continue processing other urls return url_contents -def get_all_website_links(url): - logger.info("crawling: {url} ...".format(url=url)) +def get_all_website_links(url) -> dict: + """ + Fetch all website links from a URL. 
+ + Args: + url (str): the URL to fetch links from + + Returns: + A dictionary containing the URL, the extracted links, the HTML content, the text content, and any error that occurred. + """ + logger.info("rawling: {url} ...".format(url=url)) urls = set() domain_name = urlparse(url).netloc @@ -107,10 +154,7 @@ def get_all_website_links(url): continue href = urljoin(url, href) parsed_href = urlparse(href) - - href = ( - parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path - ) + href = urlunparse((parsed_href.scheme, parsed_href.netloc, parsed_href.path, '', '', '')) if not is_valid(href): continue if href in urls: @@ -141,44 +185,42 @@ def get_all_website_links(url): } -def get_readable_text_from_soup(soup): - markdown_output = "" - - for tag in soup.find_all( - ["h1", "h2", "h3", "h4", "h5", "h6", "p", "a", "ul", "ol", "li"] - ): - if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: - markdown_output += ( - "#" * int(tag.name[1]) + " " + tag.get_text().strip() + "\n\n" - ) - elif tag.name == "p": - markdown_output += tag.get_text().strip() + "\n\n" - elif tag.name == "a": - markdown_output += f"[{tag.get_text().strip()}]({tag.get('href')})\n\n" - elif tag.name == "ul": - for li in tag.find_all("li"): - markdown_output += f"* {li.get_text().strip()}\n" - markdown_output += "\n" - elif tag.name == "ol": - for index, li in enumerate(tag.find_all("li")): - markdown_output += f"{index + 1}. {li.get_text().strip()}\n" - markdown_output += "\n" - - return markdown_output - - -def get_all_website_links_rec(url, reviewd_urls, limit=None): +def get_readable_text_from_soup(soup) -> str: + """ + Extract readable text from a BeautifulSoup object and convert it to Markdown. + + Args: + soup (BeautifulSoup): a BeautifulSoup object + + Returns: + The extracted text in Markdown format. + """ + html_converter = html2text.HTML2Text() + html_converter.ignore_links = False + return html_converter.handle(str(soup)) + +def get_all_website_links_recursively(url, reviewed_urls, limit=None): + """ + Recursively gathers all links from a given website up to a specified limit. + + Args: + url (str): The starting URL to fetch links from. + reviewed_urls (dict): A dictionary to keep track of reviewed URLs and associated data. + limit (int, optional): The maximum number of URLs to process. + + TODO: Refactor this function to use a iterative aproach instead of recursion + """ if limit is not None: - if len(reviewd_urls) >= limit: - return reviewd_urls + if len(reviewed_urls) >= limit: + return reviewed_urls - if url not in reviewd_urls: + if url not in reviewed_urls: try: - reviewd_urls[url] = get_all_website_links(url) + reviewed_urls[url] = get_all_website_links(url) except Exception as e: error_message = traceback.format_exc().splitlines()[-1] logger.error("An exception occurred: %s", str(e)) - reviewd_urls[url] = { + reviewed_urls[url] = { "url": url, "urls": [], "html_content": "", @@ -189,16 +231,16 @@ def get_all_website_links_rec(url, reviewd_urls, limit=None): to_rev_url_list = [] # create a list of new urls to review that don't exist in the already reviewed ones - for new_url in reviewd_urls[url]["urls"]: + for new_url in reviewed_urls[url]["urls"]: # if this is already in the urls, then no need to go and crawl for it - if new_url in reviewd_urls or new_url in to_rev_url_list: + if new_url in reviewed_urls or new_url in to_rev_url_list: continue # insert immediately to count limit between threads. 
fill later url_list_lock = Lock() with url_list_lock: - if limit is None or len(reviewd_urls) < limit: - reviewd_urls[new_url] = {} + if limit is None or len(reviewed_urls) < limit: + reviewed_urls[new_url] = {} to_rev_url_list.append(new_url) else: break @@ -206,15 +248,24 @@ def get_all_website_links_rec(url, reviewd_urls, limit=None): if len(to_rev_url_list) > 0: new_revised_urls = parallel_get_all_website_links(to_rev_url_list) - reviewd_urls.update(new_revised_urls) + reviewed_urls.update(new_revised_urls) for new_url in new_revised_urls: - get_all_website_links_rec(new_url, reviewd_urls, limit) + get_all_website_links_recursively(new_url, reviewed_urls, limit) + +def get_all_websites(urls, limit=1, html=False) -> pd.DataFrame: + """ + Crawl a list of websites and return a DataFrame containing the results. -def get_all_websites(urls, limit=1, html=False): - reviewd_urls = {} + Args: + urls (list): a list of URLs to crawl + html (bool): a boolean indicating whether to include the HTML content in the results + Returns: + A DataFrame containing the results. + """ + reviewed_urls = {} def fetch_url(url): # Allow URLs to be passed wrapped in quotation marks so they can be used # directly from the SQL editor. @@ -224,8 +275,9 @@ def fetch_url(url): if urlparse(url).scheme == "": # Try HTTPS first url = "https://" + url - get_all_website_links_rec(url, reviewd_urls, limit) + get_all_website_links_recursively(url, reviewed_urls, limit) + # Use a ThreadPoolExecutor to run the helper function in parallel. with concurrent.futures.ThreadPoolExecutor() as executor: future_to_url = {executor.submit(fetch_url, url): url for url in urls} @@ -236,21 +288,32 @@ def fetch_url(url): if html is False: columns_to_ignore += ["html_content"] df = dict_to_dataframe( - reviewd_urls, columns_to_ignore=columns_to_ignore, index_name="url" + reviewed_urls, columns_to_ignore=columns_to_ignore, index_name="url" ) - print('get_all_websites', df) + if not df.empty and df[df.error.isna()].empty: raise Exception(str(df.iloc[0].error)) return df - -def dict_to_dataframe(dict_of_dicts, columns_to_ignore=None, index_name=None): +def dict_to_dataframe(dict_of_dicts, columns_to_ignore=None, index_name=None) -> pd.DataFrame: + """ + Convert a dictionary of dictionaries to a DataFrame. + + Args: + dict_of_dicts (dict): a dictionary of dictionaries + columns_to_ignore (list): a list of columns to ignore + index_name (str): the name of the index column + Returns: + A DataFrame containing the data. 
+ """ df = pd.DataFrame.from_dict(dict_of_dicts, orient="index") if columns_to_ignore: - df = df.drop(columns_to_ignore, axis=1, errors="ignore") + for column in columns_to_ignore: + if column in df.columns: + df = df.drop(column, axis=1) if index_name: df.index.name = index_name - return df + return df \ No newline at end of file From 88c49a396cfdc839b6a3188a73ac1395ddaa5fa9 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Thu, 9 May 2024 13:27:31 +0200 Subject: [PATCH 05/11] Lint --- .../handlers/web_handler/urlcrawl_helpers.py | 10 +++++++--- .../integrations/handlers/web_handler/web_handler.py | 5 ----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py b/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py index fb67a0b9213..2d7d166cee7 100644 --- a/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +++ b/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py @@ -62,6 +62,7 @@ def pdf_to_markdown(response, gap_threshold=10): return "\n".join(markdown_lines) + def is_valid(url) -> bool: """ Check if a URL is valid. @@ -199,6 +200,7 @@ def get_readable_text_from_soup(soup) -> str: html_converter.ignore_links = False return html_converter.handle(str(soup)) + def get_all_website_links_recursively(url, reviewed_urls, limit=None): """ Recursively gathers all links from a given website up to a specified limit. @@ -208,7 +210,7 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None): reviewed_urls (dict): A dictionary to keep track of reviewed URLs and associated data. limit (int, optional): The maximum number of URLs to process. - TODO: Refactor this function to use a iterative aproach instead of recursion + TODO: Refactor this function to use a iterative aproach instead of recursion """ if limit is not None: if len(reviewed_urls) >= limit: @@ -266,6 +268,7 @@ def get_all_websites(urls, limit=1, html=False) -> pd.DataFrame: A DataFrame containing the results. """ reviewed_urls = {} + def fetch_url(url): # Allow URLs to be passed wrapped in quotation marks so they can be used # directly from the SQL editor. @@ -295,10 +298,11 @@ def fetch_url(url): raise Exception(str(df.iloc[0].error)) return df + def dict_to_dataframe(dict_of_dicts, columns_to_ignore=None, index_name=None) -> pd.DataFrame: """ Convert a dictionary of dictionaries to a DataFrame. 
- + Args: dict_of_dicts (dict): a dictionary of dictionaries columns_to_ignore (list): a list of columns to ignore @@ -316,4 +320,4 @@ def dict_to_dataframe(dict_of_dicts, columns_to_ignore=None, index_name=None) -> if index_name: df.index.name = index_name - return df \ No newline at end of file + return df diff --git a/mindsdb/integrations/handlers/web_handler/web_handler.py b/mindsdb/integrations/handlers/web_handler/web_handler.py index 1cf01b897a7..885b377dcad 100644 --- a/mindsdb/integrations/handlers/web_handler/web_handler.py +++ b/mindsdb/integrations/handlers/web_handler/web_handler.py @@ -5,11 +5,6 @@ from mindsdb.integrations.libs.api_handler import APIHandler, APITable from mindsdb.integrations.utilities.sql_utils import extract_comparison_conditions, project_dataframe -from mindsdb.integrations.libs.response import ( - HandlerStatusResponse as StatusResponse, - HandlerResponse as Response, - RESPONSE_TYPE -) from mindsdb.utilities.security import is_private_url from mindsdb.utilities.config import Config From a6d5a7edfc7fa82331ec2ff1bcbfbd8b462e2793 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Thu, 9 May 2024 14:07:52 +0200 Subject: [PATCH 06/11] Add missing dependency --- mindsdb/integrations/handlers/web_handler/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mindsdb/integrations/handlers/web_handler/requirements.txt b/mindsdb/integrations/handlers/web_handler/requirements.txt index feefb08c354..e01687458b6 100644 --- a/mindsdb/integrations/handlers/web_handler/requirements.txt +++ b/mindsdb/integrations/handlers/web_handler/requirements.txt @@ -1,2 +1,3 @@ bs4 -pymupdf \ No newline at end of file +pymupdf +html2text \ No newline at end of file From 0ce014876810d3b0cb86bed542bc720266295239 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Thu, 9 May 2024 16:39:52 +0200 Subject: [PATCH 07/11] Add unit tests for web handler --- .github/workflows/test_on_push.yml | 3 +- .../handlers/web_handler/requirements.txt | 1 - .../handlers/web_handler/tests/__init__.py | 0 .../web_handler/tests/example_data.py | 18 --- .../web_handler/tests/test_helpers.py | 49 ------- .../handlers/web_handler/web_handler.py | 45 +++--- tests/unit/handlers/test_web.py | 130 ++++++++++++++++++ 7 files changed, 154 insertions(+), 92 deletions(-) delete mode 100644 mindsdb/integrations/handlers/web_handler/tests/__init__.py delete mode 100644 mindsdb/integrations/handlers/web_handler/tests/example_data.py delete mode 100644 mindsdb/integrations/handlers/web_handler/tests/test_helpers.py create mode 100644 tests/unit/handlers/test_web.py diff --git a/.github/workflows/test_on_push.yml b/.github/workflows/test_on_push.yml index ba5c45719fd..8e3eb0df371 100644 --- a/.github/workflows/test_on_push.yml +++ b/.github/workflows/test_on_push.yml @@ -119,6 +119,7 @@ jobs: pip install mindsdb[mssql] pip install mindsdb[clickhouse] pip install mindsdb[snowflake] + pip install mindsdb[web] pip freeze - name: Run unit tests run: | @@ -133,7 +134,7 @@ jobs: fi - name: Run Handlers tests and submit Coverage to coveralls run: | - handlers=("mysql" "postgres" "mssql" "clickhouse" "snowflake") + handlers=("mysql" "postgres" "mssql" "clickhouse" "snowflake" "web") for handler in "${handlers[@]}" do pytest --cov=mindsdb/integrations/handlers/${handler}_handler tests/unit/handlers/test_${handler}.py diff --git a/mindsdb/integrations/handlers/web_handler/requirements.txt b/mindsdb/integrations/handlers/web_handler/requirements.txt index e01687458b6..062cdd061ad 100644 --- 
a/mindsdb/integrations/handlers/web_handler/requirements.txt +++ b/mindsdb/integrations/handlers/web_handler/requirements.txt @@ -1,3 +1,2 @@ -bs4 pymupdf html2text \ No newline at end of file diff --git a/mindsdb/integrations/handlers/web_handler/tests/__init__.py b/mindsdb/integrations/handlers/web_handler/tests/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/mindsdb/integrations/handlers/web_handler/tests/example_data.py b/mindsdb/integrations/handlers/web_handler/tests/example_data.py deleted file mode 100644 index 2b2708d351a..00000000000 --- a/mindsdb/integrations/handlers/web_handler/tests/example_data.py +++ /dev/null @@ -1,18 +0,0 @@ -PDF_CONTENT = ( - b"%PDF-1.7\n\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n\n2 0 obj\n<< /Type /Pages " - b"/Kids [3 0 R] /Count 1 >>\nendobj\n\n3 0 obj\n<< /Type /Page /Parent 2 0 R /Contents 4 0 R " - b">>\nendobj\n\n4 0 obj\n<< /Length 22 >>\nstream\nBT\n/Helvetica 12 Tf\n1 0 0 1 50 700 Tm\n(" - b"Hello, this is a test!) Tj\nET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 " - b"f\n0000000010 00000 n\n0000000077 00000 n\n0000000122 00000 n\n0000000203 00000 n\n0000000277 " - b"00000 n\ntrailer\n<< /Size 5 /Root 1 0 R >>\nstartxref\n343\n%%EOF\n " -) - -BROKEN_PDF_CONTENT = b"%PDF-1.4\n\nThis is not a valid PDF file content\n" - -HTML_SAMPLE_1 = "

<h1>Heading One</h1><h2>Heading Two</h2><ul><li>item1</li><li>item2</li><li>item3</li></ul>
" - -MARKDOWN_SAMPLE_1 = "# Heading One\n\n## Heading Two\n\n* item1\n* item2\n* item3\n\n" - -HTML_SAMPLE_2 = '

<h3>Heading</h3><p>text</p><a href="https://google.com">link</a><li>item1</li>
' - -MARKDOWN_SAMPLE_2 = "### Heading\n\ntext\n\n[link](https://google.com)\n\n\n\n" diff --git a/mindsdb/integrations/handlers/web_handler/tests/test_helpers.py b/mindsdb/integrations/handlers/web_handler/tests/test_helpers.py deleted file mode 100644 index f67e64bcf7e..00000000000 --- a/mindsdb/integrations/handlers/web_handler/tests/test_helpers.py +++ /dev/null @@ -1,49 +0,0 @@ -from unittest.mock import patch - -import pytest -from bs4 import BeautifulSoup -from fitz.fitz import FileDataError - -from mindsdb.integrations.handlers.web_handler import urlcrawl_helpers as C -from mindsdb.integrations.handlers.web_handler.tests import example_data as D - - -class TestPDFToMarkdownTest: - @patch("requests.Response") - def test_pdf_to_markdown(self, mock_response) -> None: - response = mock_response.return_value - response.content = D.PDF_CONTENT - result = C.pdf_to_markdown(response) - assert "Hello, this is a test!" in result - - @patch("requests.Response") - def test_broken_pdf_to_markdown(self, mock_response) -> None: - response = mock_response.return_value - response.content = D.BROKEN_PDF_CONTENT - - with pytest.raises(FileDataError) as excinfo: - C.pdf_to_markdown(response) - - assert str(excinfo.value) == "cannot open broken document" - - -@pytest.mark.parametrize( - "url, result", - [ - ("google", False), - ("google.com", False), - ("https://google.com", True), - ("", False), - ], -) -def test_url_validation(url: str, result: bool) -> None: - assert C.is_valid(url) == result - - -@pytest.mark.parametrize( - "html, markdown", - [(D.HTML_SAMPLE_1, D.MARKDOWN_SAMPLE_1), (D.HTML_SAMPLE_2, D.MARKDOWN_SAMPLE_2)], -) -def test_get_readable_text_from_soup(html: str, markdown: str) -> None: - soup = BeautifulSoup(html, "html.parser") - assert markdown == C.get_readable_text_from_soup(soup) diff --git a/mindsdb/integrations/handlers/web_handler/web_handler.py b/mindsdb/integrations/handlers/web_handler/web_handler.py index 885b377dcad..e1613cd502e 100644 --- a/mindsdb/integrations/handlers/web_handler/web_handler.py +++ b/mindsdb/integrations/handlers/web_handler/web_handler.py @@ -14,40 +14,38 @@ class CrawlerTable(APITable): def select(self, query: ast.Select) -> pd.DataFrame: + """ + Selects data from the provided websites + Args: + query (ast.Select): Given SQL SELECT query + + Returns: + dataframe: Dataframe containing the crawled data + + Raises: + NotImplementedError: If the query is not supported + """ conditions = extract_comparison_conditions(query.where) urls = [] - for op, arg1, arg2 in conditions: - - if op == 'or': + for operator, arg1, arg2 in conditions: + if operator == 'or': raise NotImplementedError('OR is not supported') - if arg1 == 'url': - url = arg2 - - if op == '=': - urls = [str(url)] - elif op == 'in': - if type(url) == str: - urls = [str(url)] - else: - urls = url + if operator in ['=', 'in']: + urls = [str(arg2)] if isinstance(arg2, str) else arg2 else: raise NotImplementedError('Invalid URL format. 
Please provide a single URL like url = "example.com" or' 'multiple URLs using the format url IN ("url1", "url2", ...)') - else: - pass if len(urls) == 0: raise NotImplementedError( 'You must specify what url you want to crawl, for example: SELECT * FROM crawl WHERE url = "someurl"') if query.limit is None: - raise NotImplementedError('You must specify a LIMIT which defines the number of pages to crawl') - limit = query.limit.value + raise NotImplementedError('You must specify a LIMIT clause which defines the number of pages to crawl') - if limit < 0: - limit = 0 + limit = query.limit.value config = Config() is_cloud = config.get("cloud", False) @@ -66,6 +64,9 @@ def select(self, query: ast.Select) -> pd.DataFrame: return result def get_columns(self): + """ + Returns the columns of the crawler table + """ return [ 'url', 'text_content', @@ -74,11 +75,9 @@ def get_columns(self): class WebHandler(APIHandler): - """A class for handling crawling content from websites. - - Attributes: """ - + Web handler, handling crawling content from websites. + """ def __init__(self, name=None, **kwargs): super().__init__(name) crawler = CrawlerTable(self) diff --git a/tests/unit/handlers/test_web.py b/tests/unit/handlers/test_web.py new file mode 100644 index 00000000000..9f724639d8a --- /dev/null +++ b/tests/unit/handlers/test_web.py @@ -0,0 +1,130 @@ +import unittest +from mindsdb.integrations.libs.api_handler_exceptions import TableAlreadyExists +from mindsdb.integrations.handlers.web_handler.web_handler import WebHandler +from mindsdb.integrations.handlers.web_handler.web_handler import CrawlerTable +from mindsdb.integrations.handlers.web_handler import urlcrawl_helpers as helpers +from unittest.mock import patch, MagicMock +import concurrent.futures +import pytest +from bs4 import BeautifulSoup +import pandas as pd + + +class TestWebsHandler(unittest.TestCase): + + def setUp(self) -> None: + self.handler = WebHandler(name='test_web_handler') + + def test_crawler_already_registered(self): + with self.assertRaises(TableAlreadyExists): + self.handler._register_table('crawler', CrawlerTable) + + +PDF_CONTENT = ( + b"%PDF-1.7\n\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n\n2 0 obj\n<< /Type /Pages " + b"/Kids [3 0 R] /Count 1 >>\nendobj\n\n3 0 obj\n<< /Type /Page /Parent 2 0 R /Contents 4 0 R " + b">>\nendobj\n\n4 0 obj\n<< /Length 22 >>\nstream\nBT\n/Helvetica 12 Tf\n1 0 0 1 50 700 Tm\n(" + b"Hello, this is a test!) Tj\nET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 " + b"f\n0000000010 00000 n\n0000000077 00000 n\n0000000122 00000 n\n0000000203 00000 n\n0000000277 " + b"00000 n\ntrailer\n<< /Size 5 /Root 1 0 R >>\nstartxref\n343\n%%EOF\n " +) + +BROKEN_PDF_CONTENT = b"%PDF-1.4\n\nThis is not a valid PDF file content\n" + +HTML_SAMPLE_1 = "

<h1>Heading One</h1> <h2>Heading Two</h2>
" + +MARKDOWN_SAMPLE_1 = "# Heading One \n\n ## Heading Two" + + +class TestPDFToMarkdownTest: + @patch("requests.Response") + def test_pdf_to_markdown(self, mock_response) -> None: + response = mock_response.return_value + response.content = PDF_CONTENT + result = helpers.pdf_to_markdown(response) + assert "Hello, this is a test!" in result + + @patch("requests.Response") + def test_broken_pdf_to_markdown(self, mock_response) -> None: + response = mock_response.return_value + response.content = BROKEN_PDF_CONTENT + + with pytest.raises(Exception, match='Failed to process PDF data: cannot open broken document'): + helpers.pdf_to_markdown(response) + + +@pytest.mark.parametrize( + "url, result", + [ + ("google", False), + ("google.com", False), + ("https://google.com", True), + ("", False), + ], +) +def test_url_validation(url: str, result: bool) -> None: + assert helpers.is_valid(url) == result + + +@pytest.mark.parametrize( + "html, markdown", + [(HTML_SAMPLE_1, MARKDOWN_SAMPLE_1)], +) +def test_get_readable_text_from_soup(html: str, markdown: str) -> None: + soup = BeautifulSoup(html, "html.parser") + import re + expected = re.sub(r'\s+', ' ', markdown).strip() + actual = re.sub(r'\s+', ' ', helpers.get_readable_text_from_soup(soup)).strip() + + assert expected == actual + + +@patch("mindsdb.integrations.handlers.web_handler.urlcrawl_helpers.get_all_website_links") +@patch("concurrent.futures.ProcessPoolExecutor") +def test_parallel_get_all_website_links(mock_executor, mock_get_links): + # Setup: Mock the get_all_website_links function to return a list of links + mock_get_links.return_value = ["link1", "link2", "link3"] + + # Setup: Mock the ProcessPoolExecutor class to return a mock executor + mock_executor_instance = MagicMock() + mock_executor.return_value.__enter__.return_value = mock_executor_instance + + # Setup: Mock the executor to return a future that immediately completes with a result + mock_future = concurrent.futures.Future() + mock_future.set_result(["link1", "link2", "link3"]) + mock_executor_instance.submit.return_value = mock_future + + # Call the function with a list of URLs + urls = ["url1", "url2", "url3"] + result = helpers.parallel_get_all_website_links(urls) + + # Assert: Check if the function returns the expected result + expected = { + "url1": ["link1", "link2", "link3"], + "url2": ["link1", "link2", "link3"], + "url3": ["link1", "link2", "link3"], + } + assert result == expected + + # Assert: Check if the mocks were called as expected + mock_get_links.assert_called() + + +def test_dict_to_dataframe(): + # Setup: Create a dictionary of dictionaries + data = { + "row1": {"column1": 1, "column2": 2, "column3": 3}, + "row2": {"column1": 4, "column2": 5, "column3": 6}, + "row3": {"column1": 7, "column2": 8, "column3": 9}, + } + + # Call the function with the data, ignoring "column2" and setting the index name to "ID" + df = helpers.dict_to_dataframe(data, columns_to_ignore=["column2"], index_name="ID") + + # Assert: Check if the DataFrame has the expected structure + expected = pd.DataFrame({ + "column1": {"row1": 1, "row2": 4, "row3": 7}, + "column3": {"row1": 3, "row2": 6, "row3": 9}, + }) + expected.index.name = "ID" + pd.testing.assert_frame_equal(df, expected) From 97caf4bf26e01f7f8211f8ea0eadbfb75df877b6 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Fri, 10 May 2024 11:58:44 +0200 Subject: [PATCH 08/11] Update test --- tests/unit/handlers/test_web.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/handlers/test_web.py 
b/tests/unit/handlers/test_web.py index 9f724639d8a..d91670d0030 100644 --- a/tests/unit/handlers/test_web.py +++ b/tests/unit/handlers/test_web.py @@ -49,7 +49,7 @@ def test_broken_pdf_to_markdown(self, mock_response) -> None: response = mock_response.return_value response.content = BROKEN_PDF_CONTENT - with pytest.raises(Exception, match='Failed to process PDF data: cannot open broken document'): + with pytest.raises(Exception, match="Failed to process PDF data"): helpers.pdf_to_markdown(response) From c3f9b97aa77e5ae698ae8a16d95ec3594440e838 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Tue, 14 May 2024 12:02:31 +0200 Subject: [PATCH 09/11] Review changes --- docs/integrations/app-integrations/web-crawler.mdx | 4 ++-- mindsdb/integrations/handlers/web_handler/README.md | 4 ++-- .../integrations/handlers/web_handler/web_handler.py | 12 ++++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/docs/integrations/app-integrations/web-crawler.mdx b/docs/integrations/app-integrations/web-crawler.mdx index cf24de406db..b744f9d08ca 100644 --- a/docs/integrations/app-integrations/web-crawler.mdx +++ b/docs/integrations/app-integrations/web-crawler.mdx @@ -25,13 +25,13 @@ CREATE DATABASE my_web WITH ENGINE = 'web'; ``` -The above query creates a database called `my_web`. This database by default have a table called `crawler` that we can use to crawl data from a given url/urls. +The above query creates a database called `my_web`. This database by default has a table called `crawler` that we can use to crawl data from a given url/urls. ## Usage -Specifying a query `LIMIT` is required. To crawl all pages on a site, consider setting the limit to a high value, such as 10,000, which exceeds the expected number of pages. Be aware that setting a higher limit may result in longer response times. +Specifying a `LIMIT` clause is required. To crawl all pages on a site, consider setting the limit to a high value, such as 10,000, which exceeds the expected number of pages. Be aware that setting a higher limit may result in longer response times. ### Get Websites Content diff --git a/mindsdb/integrations/handlers/web_handler/README.md b/mindsdb/integrations/handlers/web_handler/README.md index cf24de406db..b744f9d08ca 100644 --- a/mindsdb/integrations/handlers/web_handler/README.md +++ b/mindsdb/integrations/handlers/web_handler/README.md @@ -25,13 +25,13 @@ CREATE DATABASE my_web WITH ENGINE = 'web'; ``` -The above query creates a database called `my_web`. This database by default have a table called `crawler` that we can use to crawl data from a given url/urls. +The above query creates a database called `my_web`. This database by default has a table called `crawler` that we can use to crawl data from a given url/urls. ## Usage -Specifying a query `LIMIT` is required. To crawl all pages on a site, consider setting the limit to a high value, such as 10,000, which exceeds the expected number of pages. Be aware that setting a higher limit may result in longer response times. +Specifying a `LIMIT` clause is required. To crawl all pages on a site, consider setting the limit to a high value, such as 10,000, which exceeds the expected number of pages. Be aware that setting a higher limit may result in longer response times. 
### Get Websites Content diff --git a/mindsdb/integrations/handlers/web_handler/web_handler.py b/mindsdb/integrations/handlers/web_handler/web_handler.py index e1613cd502e..42a42bf500e 100644 --- a/mindsdb/integrations/handlers/web_handler/web_handler.py +++ b/mindsdb/integrations/handlers/web_handler/web_handler.py @@ -1,5 +1,6 @@ import pandas as pd +from mindsdb.integrations.libs.response import HandlerStatusResponse from mindsdb_sql.parser import ast from mindsdb.integrations.libs.api_handler import APIHandler, APITable @@ -82,3 +83,14 @@ def __init__(self, name=None, **kwargs): super().__init__(name) crawler = CrawlerTable(self) self._register_table('crawler', crawler) + + def check_connection(self) -> HandlerStatusResponse: + """ + Checks the connection to the web handler + @TODO: Implement a better check for the connection + + Returns: + HandlerStatusResponse: Response containing the status of the connection. Hardcoded to True for now. + """ + response = HandlerStatusResponse(True) + return response \ No newline at end of file From 19bb55bcf9f327d1a4326f6fc653601daeed2985 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Tue, 14 May 2024 14:05:44 +0200 Subject: [PATCH 10/11] Lint --- mindsdb/integrations/handlers/web_handler/web_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindsdb/integrations/handlers/web_handler/web_handler.py b/mindsdb/integrations/handlers/web_handler/web_handler.py index 42a42bf500e..469d3723f95 100644 --- a/mindsdb/integrations/handlers/web_handler/web_handler.py +++ b/mindsdb/integrations/handlers/web_handler/web_handler.py @@ -93,4 +93,4 @@ def check_connection(self) -> HandlerStatusResponse: HandlerStatusResponse: Response containing the status of the connection. Hardcoded to True for now. 
""" response = HandlerStatusResponse(True) - return response \ No newline at end of file + return response From f999bf50c629f091284f68ee46ddf5e673635ee7 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Tue, 14 May 2024 15:40:20 +0200 Subject: [PATCH 11/11] Add more tests for the SELECT validation --- tests/unit/handlers/test_web.py | 180 ++++++++++++++++++-------------- 1 file changed, 102 insertions(+), 78 deletions(-) diff --git a/tests/unit/handlers/test_web.py b/tests/unit/handlers/test_web.py index d91670d0030..2c93147559a 100644 --- a/tests/unit/handlers/test_web.py +++ b/tests/unit/handlers/test_web.py @@ -7,7 +7,6 @@ import concurrent.futures import pytest from bs4 import BeautifulSoup -import pandas as pd class TestWebsHandler(unittest.TestCase): @@ -36,7 +35,7 @@ def test_crawler_already_registered(self): MARKDOWN_SAMPLE_1 = "# Heading One \n\n ## Heading Two" -class TestPDFToMarkdownTest: +class TestWebHelpers(unittest.TestCase): @patch("requests.Response") def test_pdf_to_markdown(self, mock_response) -> None: response = mock_response.return_value @@ -52,79 +51,104 @@ def test_broken_pdf_to_markdown(self, mock_response) -> None: with pytest.raises(Exception, match="Failed to process PDF data"): helpers.pdf_to_markdown(response) - -@pytest.mark.parametrize( - "url, result", - [ - ("google", False), - ("google.com", False), - ("https://google.com", True), - ("", False), - ], -) -def test_url_validation(url: str, result: bool) -> None: - assert helpers.is_valid(url) == result - - -@pytest.mark.parametrize( - "html, markdown", - [(HTML_SAMPLE_1, MARKDOWN_SAMPLE_1)], -) -def test_get_readable_text_from_soup(html: str, markdown: str) -> None: - soup = BeautifulSoup(html, "html.parser") - import re - expected = re.sub(r'\s+', ' ', markdown).strip() - actual = re.sub(r'\s+', ' ', helpers.get_readable_text_from_soup(soup)).strip() - - assert expected == actual - - -@patch("mindsdb.integrations.handlers.web_handler.urlcrawl_helpers.get_all_website_links") -@patch("concurrent.futures.ProcessPoolExecutor") -def test_parallel_get_all_website_links(mock_executor, mock_get_links): - # Setup: Mock the get_all_website_links function to return a list of links - mock_get_links.return_value = ["link1", "link2", "link3"] - - # Setup: Mock the ProcessPoolExecutor class to return a mock executor - mock_executor_instance = MagicMock() - mock_executor.return_value.__enter__.return_value = mock_executor_instance - - # Setup: Mock the executor to return a future that immediately completes with a result - mock_future = concurrent.futures.Future() - mock_future.set_result(["link1", "link2", "link3"]) - mock_executor_instance.submit.return_value = mock_future - - # Call the function with a list of URLs - urls = ["url1", "url2", "url3"] - result = helpers.parallel_get_all_website_links(urls) - - # Assert: Check if the function returns the expected result - expected = { - "url1": ["link1", "link2", "link3"], - "url2": ["link1", "link2", "link3"], - "url3": ["link1", "link2", "link3"], - } - assert result == expected - - # Assert: Check if the mocks were called as expected - mock_get_links.assert_called() - - -def test_dict_to_dataframe(): - # Setup: Create a dictionary of dictionaries - data = { - "row1": {"column1": 1, "column2": 2, "column3": 3}, - "row2": {"column1": 4, "column2": 5, "column3": 6}, - "row3": {"column1": 7, "column2": 8, "column3": 9}, - } - - # Call the function with the data, ignoring "column2" and setting the index name to "ID" - df = helpers.dict_to_dataframe(data, 
columns_to_ignore=["column2"], index_name="ID") - - # Assert: Check if the DataFrame has the expected structure - expected = pd.DataFrame({ - "column1": {"row1": 1, "row2": 4, "row3": 7}, - "column3": {"row1": 3, "row2": 6, "row3": 9}, - }) - expected.index.name = "ID" - pd.testing.assert_frame_equal(df, expected) + def test_url_validation(self): + assert helpers.is_valid('https://google.com') is True + assert helpers.is_valid('google.com') is False + + def test_get_readable_text_from_soup(self) -> None: + soup = BeautifulSoup(HTML_SAMPLE_1, "html.parser") + import re + expected = re.sub(r'\s+', ' ', MARKDOWN_SAMPLE_1).strip() + actual = re.sub(r'\s+', ' ', helpers.get_readable_text_from_soup(soup)).strip() + + assert expected == actual + + @patch("mindsdb.integrations.handlers.web_handler.urlcrawl_helpers.get_all_website_links") + @patch("concurrent.futures.ProcessPoolExecutor") + def test_parallel_get_all_website_links(self, mock_executor, mock_get_links): + # Setup: Mock the get_all_website_links function to return a list of links + mock_get_links.return_value = ["link1", "link2", "link3"] + + # Setup: Mock the ProcessPoolExecutor class to return a mock executor + mock_executor_instance = MagicMock() + mock_executor.return_value.__enter__.return_value = mock_executor_instance + + # Setup: Mock the executor to return a future that immediately completes with a result + mock_future = concurrent.futures.Future() + mock_future.set_result(["link1", "link2", "link3"]) + mock_executor_instance.submit.return_value = mock_future + + # Call the function with a list of URLs + urls = ["url1", "url2", "url3"] + result = helpers.parallel_get_all_website_links(urls) + + # Assert: Check if the function returns the expected result + expected = { + "url1": ["link1", "link2", "link3"], + "url2": ["link1", "link2", "link3"], + "url3": ["link1", "link2", "link3"], + } + assert result == expected + + # Assert: Check if the mocks were called as expected + mock_get_links.assert_called() + + +class TestWebHandler(unittest.TestCase): + + @patch('mindsdb.integrations.handlers.web_handler.web_handler.extract_comparison_conditions') + def test_select_with_or_operator_raise_error(self, mock_extract_comparison_conditions): + mock_extract_comparison_conditions.return_value = [('or', 'url', 'example.com')] + + crawler_table = CrawlerTable(handler=MagicMock()) + mock_query = MagicMock() + mock_ast = MagicMock() + mock_ast.get_type.return_value = 'OR' + + mock_query.where = mock_ast + with self.assertRaises(NotImplementedError) as context: + crawler_table.select(mock_query) + self.assertTrue('OR is not supported' in str(context.exception)) + + @patch('mindsdb.integrations.handlers.web_handler.web_handler.extract_comparison_conditions') + def test_select_with_invalid_url_format(self, mock_extract_comparison_conditions): + mock_extract_comparison_conditions.return_value = [('WHERE', 'url', 'example.com')] + + crawler_table = CrawlerTable(handler=MagicMock()) + mock_query = MagicMock() + mock_ast = MagicMock() + mock_ast.get_type.return_value = 'WHERE URL ("example.com")' + + mock_query.where = mock_ast + with self.assertRaises(NotImplementedError) as context: + crawler_table.select(mock_query) + self.assertTrue('Invalid URL format.' 
in str(context.exception)) + + @patch('mindsdb.integrations.handlers.web_handler.web_handler.extract_comparison_conditions') + def test_select_with_missing_url_(self, mock_extract_comparison_conditions): + mock_extract_comparison_conditions.return_value = [('WHERE', 'id', '1')] + + crawler_table = CrawlerTable(handler=MagicMock()) + mock_query = MagicMock() + mock_ast = MagicMock() + mock_ast.get_type.return_value = 'WHERE ID ("1")' + + mock_query.where = mock_ast + with self.assertRaises(NotImplementedError) as context: + crawler_table.select(mock_query) + self.assertTrue('You must specify what url you want to craw' in str(context.exception)) + + @patch('mindsdb.integrations.handlers.web_handler.web_handler.extract_comparison_conditions') + def test_select_with_missing_limit(self, mock_extract_comparison_conditions): + mock_extract_comparison_conditions.return_value = [('=', 'url', 'https://docs.mindsdb.com')] + + crawler_table = CrawlerTable(handler=MagicMock()) + mock_query = MagicMock() + mock_ast = MagicMock() + mock_ast.get_type.return_value = 'URL = "https://docs.mindsdb.com"' + + mock_query.where = mock_ast + mock_query.limit = None + with self.assertRaises(NotImplementedError) as context: + crawler_table.select(mock_query) + self.assertTrue('You must specify a LIMIT clause' in str(context.exception))
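
Beyond the unit tests above, the rewritten helpers can also be exercised directly in Python. The sketch below is illustrative only and not part of the patch; it assumes the dependencies from `requirements.txt` (`pymupdf`, `html2text`) are installed and that the crawled site is reachable from the current machine.

```python
# Minimal sketch (assumption: network access to the target site) showing how
# the rewritten helpers can be called outside of the SQL interface.
from mindsdb.integrations.handlers.web_handler import urlcrawl_helpers as helpers

# Crawl a single page; html=False drops the "html_content" column, which is
# also what CrawlerTable.select passes when serving SQL queries.
df = helpers.get_all_websites(["docs.mindsdb.com"], limit=1, html=False)

# The DataFrame is indexed by url and keeps the extracted text plus any
# per-URL error, matching the columns exposed by the crawler table.
print(df[["text_content", "error"]])
```

Locally, the new tests run with the same invocation the CI workflow now uses, for example `pytest tests/unit/handlers/test_web.py`.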