From b75e261dfb8cff8bc2634912236330b082341a29 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Wed, 8 May 2024 16:58:14 +0200 Subject: [PATCH 01/11] Initial lint and cleanup --- .flake8 | 2 - .../handlers/web_handler/__about__.py | 2 +- .../handlers/web_handler/urlcrawl_helpers.py | 83 +------------------ .../handlers/web_handler/web_handler.py | 35 ++------ 4 files changed, 11 insertions(+), 111 deletions(-) diff --git a/.flake8 b/.flake8 index 0171ea7e66d..cf360dcde2f 100644 --- a/.flake8 +++ b/.flake8 @@ -89,11 +89,9 @@ exclude = mindsdb/integrations/handlers/quickbooks_handler/* mindsdb/integrations/handlers/strava_handler/* mindsdb/integrations/handlers/strava_handler/* - mindsdb/integrations/handlers/web_handler/* mindsdb/integrations/handlers/strava_handler/* mindsdb/integrations/handlers/github_handler/* mindsdb/integrations/handlers/vitess_handler/* - mindsdb/integrations/handlers/web_handler/* mindsdb/integrations/handlers/impala_handler/* mindsdb/integrations/handlers/tdengine_handler/* mindsdb/integrations/handlers/huggingface_api_handler/* diff --git a/mindsdb/integrations/handlers/web_handler/__about__.py b/mindsdb/integrations/handlers/web_handler/__about__.py index 9600d54c970..12fdd95dcaf 100644 --- a/mindsdb/integrations/handlers/web_handler/__about__.py +++ b/mindsdb/integrations/handlers/web_handler/__about__.py @@ -6,4 +6,4 @@ __github__ = 'https://github.com/mindsdb/mindsdb' __pypi__ = 'https://pypi.org/project/mindsdb/' __license__ = 'MIT' -__copyright__ = 'Copyright 2022- mindsdb' +__copyright__ = 'Copyright 2022 - MindsDB' diff --git a/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py b/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py index d839711a6ca..0f8a2024a94 100644 --- a/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +++ b/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py @@ -1,6 +1,5 @@ import concurrent.futures import io -import re import traceback from threading import Lock from urllib.parse import urljoin, urlparse @@ -13,22 +12,19 @@ logger = log.getLogger(__name__) + def pdf_to_markdown(response): - # Download the PDF from the given URL file_stream = io.BytesIO(response.content) - # Open the PDF from the in-memory file document = fitz.open(stream=file_stream, filetype="pdf") markdown_text = "" for page_num in range(len(document)): page = document.load_page(page_num) - # Get the blocks of text blocks = page.get_text("blocks") - # Sort the blocks by their vertical position on the page blocks.sort(key=lambda block: (block[1], block[0])) previous_block_bottom = 0 @@ -46,21 +42,15 @@ def pdf_to_markdown(response): markdown_text += "\n" - # Close the document document.close() return markdown_text - -url_list_lock = Lock() - - def is_valid(url): parsed = urlparse(url) return bool(parsed.netloc) and bool(parsed.scheme) -# this bad boy gets all the crawling done in parallel def parallel_get_all_website_links(urls): url_contents = {} @@ -79,18 +69,16 @@ def parallel_get_all_website_links(urls): url_contents[url] = future.result() except Exception as exc: logger.error(f'{url} generated an exception: {exc}') - + return url_contents -# this crawls one individual website def get_all_website_links(url): logger.info("crawling: {url} ...".format(url=url)) urls = set() domain_name = urlparse(url).netloc try: - # Create a session to handle cookies session = requests.Session() # Add headers to mimic a real browser request @@ -98,9 +86,7 @@ def get_all_website_links(url): "User-Agent": "Mozilla/5.0 (Windows NT 10.0; 
Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36" } - # Send GET request response = session.get(url, headers=headers) - # Accept cookies if necessary if "cookie" in response.request.headers: session.cookies.update(response.cookies) @@ -156,10 +142,8 @@ def get_all_website_links(url): def get_readable_text_from_soup(soup): - # Start formatting as Markdown markdown_output = "" - # Iterate through headings and paragraphs for tag in soup.find_all( ["h1", "h2", "h3", "h4", "h5", "h6", "p", "a", "ul", "ol", "li"] ): @@ -183,14 +167,12 @@ def get_readable_text_from_soup(soup): return markdown_output -# this bad girl does the recursive crawling of the websites def get_all_website_links_rec(url, reviewd_urls, limit=None): if limit is not None: if len(reviewd_urls) >= limit: return reviewd_urls if url not in reviewd_urls: - # if something happens getting the website links for this url then log the error try: reviewd_urls[url] = get_all_website_links(url) except Exception as e: @@ -213,6 +195,7 @@ def get_all_website_links_rec(url, reviewd_urls, limit=None): continue # insert immediately to count limit between threads. fill later + url_list_lock = Lock() with url_list_lock: if limit is None or len(reviewd_urls) < limit: reviewd_urls[new_url] = {} @@ -220,7 +203,6 @@ def get_all_website_links_rec(url, reviewd_urls, limit=None): else: break - # if there is something to fetch, go fetch if len(to_rev_url_list) > 0: new_revised_urls = parallel_get_all_website_links(to_rev_url_list) @@ -230,21 +212,9 @@ def get_all_website_links_rec(url, reviewd_urls, limit=None): get_all_website_links_rec(new_url, reviewd_urls, limit) -# this crawls the websites and returns it all as a dataframe, ready to be served def get_all_websites(urls, limit=1, html=False): reviewd_urls = {} - # def fetch_url(url): - # url = url.rstrip('/') - # if urlparse(url).scheme == "": - # # Try HTTPS first - # url = "https://" + url - # reviewd_urls_iter = {} - # get_all_website_links_rec(url, reviewd_urls_iter, limit) - # return reviewd_urls_iter - - # reviewd_urls = fetch_url(urls[0]) - # Define a helper function that will be run in parallel. def fetch_url(url): # Allow URLs to be passed wrapped in quotation marks so they can be used # directly from the SQL editor. @@ -256,7 +226,6 @@ def fetch_url(url): url = "https://" + url get_all_website_links_rec(url, reviewd_urls, limit) - # Use a ThreadPoolExecutor to run the helper function in parallel. 
with concurrent.futures.ThreadPoolExecutor() as executor: future_to_url = {executor.submit(fetch_url, url): url for url in urls} @@ -269,62 +238,18 @@ def fetch_url(url): df = dict_to_dataframe( reviewd_urls, columns_to_ignore=columns_to_ignore, index_name="url" ) - + print('get_all_websites', df) if not df.empty and df[df.error.isna()].empty: - # no real data - rise exception from first row raise Exception(str(df.iloc[0].error)) return df -# this can parse the native query -def parse_urls_limit(input_str): - # Split the input string into 'url', 'limit' or 'html' parts - items = re.split(r",", input_str) - - # Initialize list for urls, limit and html - urls = [] - limit = None - html = False - - for item in items: - item = item.strip() # Remove leading/trailing whitespace - - # Check if item is a 'limit' or 'html' setting - if item.lower().startswith("limit"): - limit_match = re.search(r"\d+", item) - if limit_match: - limit = int(limit_match.group()) # Extract the number - elif item.lower().startswith("html"): - html_match = re.search(r"(true|false)", item, re.I) - if html_match: - html = ( - html_match.group().lower() == "true" - ) # Check if the value is 'true' - else: - urls.append(item) # Add the item to the url list - - return {"urls": urls, "limit": limit, "html": html} - - -# run a query that goes and crawls urls -# format url, url, ..., limit=n -# you can pass one of many urls, limit is optional -def get_df_from_query_str(query_str): - args = parse_urls_limit(query_str) - df = get_all_websites(args["urls"], args["limit"], args["html"]) - return df - - -# this flips a dictionary of dictionaries into a dataframe so we can use it in mindsdb def dict_to_dataframe(dict_of_dicts, columns_to_ignore=None, index_name=None): - # Convert dictionary of dictionaries into DataFrame df = pd.DataFrame.from_dict(dict_of_dicts, orient="index") - # If columns_to_ignore is provided, drop these columns if columns_to_ignore: df = df.drop(columns_to_ignore, axis=1, errors="ignore") - # If index_name is provided, rename the index if index_name: df.index.name = index_name diff --git a/mindsdb/integrations/handlers/web_handler/web_handler.py b/mindsdb/integrations/handlers/web_handler/web_handler.py index 43faef7236a..1cf01b897a7 100644 --- a/mindsdb/integrations/handlers/web_handler/web_handler.py +++ b/mindsdb/integrations/handlers/web_handler/web_handler.py @@ -13,7 +13,7 @@ from mindsdb.utilities.security import is_private_url from mindsdb.utilities.config import Config -from .urlcrawl_helpers import get_df_from_query_str, get_all_websites +from .urlcrawl_helpers import get_all_websites class CrawlerTable(APITable): @@ -25,7 +25,7 @@ def select(self, query: ast.Select) -> pd.DataFrame: for op, arg1, arg2 in conditions: if op == 'or': - raise NotImplementedError(f'OR is not supported') + raise NotImplementedError('OR is not supported') if arg1 == 'url': url = arg2 @@ -38,20 +38,17 @@ def select(self, query: ast.Select) -> pd.DataFrame: else: urls = url else: - raise NotImplementedError( - f'url can be url = "someurl", you can also crawl multiple sites, as follows:' - f' url IN ("url1", "url2", ..)' - ) - + raise NotImplementedError('Invalid URL format. 
Please provide a single URL like url = "example.com" or' + 'multiple URLs using the format url IN ("url1", "url2", ...)') else: pass if len(urls) == 0: raise NotImplementedError( - f'You must specify what url you want to crawl, for example: SELECT * FROM crawl WHERE url IN ("someurl", ..)') + 'You must specify what url you want to crawl, for example: SELECT * FROM crawl WHERE url = "someurl"') if query.limit is None: - raise NotImplementedError(f'You must specify a LIMIT which defines the number of pages to crawl') + raise NotImplementedError('You must specify a LIMIT which defines the number of pages to crawl') limit = query.limit.value if limit < 0: @@ -85,29 +82,9 @@ class WebHandler(APIHandler): """A class for handling crawling content from websites. Attributes: - """ def __init__(self, name=None, **kwargs): super().__init__(name) - - self.api = None - self.is_connected = True crawler = CrawlerTable(self) self._register_table('crawler', crawler) - - def check_connection(self) -> StatusResponse: - - response = StatusResponse(False) - response.success = True - - return response - - def native_query(self, query_string: str = None): - - df = get_df_from_query_str(query_string) - - return Response( - RESPONSE_TYPE.TABLE, - data_frame=df - ) From 65cb6c7fa5f315987ce9b52b71b57a6bc81ed665 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Wed, 8 May 2024 16:58:21 +0200 Subject: [PATCH 02/11] Improve README --- .../handlers/web_handler/README.md | 91 ++++++++++++++----- 1 file changed, 66 insertions(+), 25 deletions(-) diff --git a/mindsdb/integrations/handlers/web_handler/README.md b/mindsdb/integrations/handlers/web_handler/README.md index 60f1e188f82..cf24de406db 100644 --- a/mindsdb/integrations/handlers/web_handler/README.md +++ b/mindsdb/integrations/handlers/web_handler/README.md @@ -1,54 +1,95 @@ -# Build your Web crawler +--- +title: Web Crawler +sidebarTitle: Web Crawler +--- -This integration allows you to query the results of a crawler in SQL: +In this section, we present how to use a web crawler within MindsDB. -- This can be particularly useful for building A/Q systems from data on a website. +A web crawler is an automated script designed to systematically browse and index content on the internet. Within MindsDB, you can utilize a web crawler to efficiently collect data from various websites. -Note that this crawler can crawl every single sub-site from the original. +## Prerequisites -Let's see in action +Before proceeding, ensure the following prerequisites are met: + +1. Install MindsDB locally via [Docker](/setup/self-hosted/docker) or [Docker Desktop](/setup/self-hosted/docker-desktop). +2. To use Web Crawler with MindsDB, install the required dependencies following [this instruction](/setup/self-hosted/docker#install-dependencies). + +## Connection + +This handler does not require any connection parameters. + +Here is how to initialize a web crawler: ```sql --- Should be able to create a web crawler database CREATE DATABASE my_web -With - ENGINE = 'web'; +WITH ENGINE = 'web'; ``` + +The above query creates a database called `my_web`. This database by default have a table called `crawler` that we can use to crawl data from a given url/urls. + -This creates a database called my_web. This database ships with a table called crawler that we can use to crawl data given some url/urls. +## Usage + +Specifying a query `LIMIT` is required. To crawl all pages on a site, consider setting the limit to a high value, such as 10,000, which exceeds the expected number of pages. 
Be aware that setting a higher limit may result in longer response times. + -## Searching for web content in SQL +### Get Websites Content -Let's get the content of a docs.mindsdb.com website: +The following usage examples demonstrate how to retrieve content from `docs.mindsdb.com`: ```sql -SELECT - * +SELECT * FROM my_web.crawler -WHERE - url = 'docs.mindsdb.com' +WHERE url = 'docs.mindsdb.com' LIMIT 1; ``` +You can also retrieve content from internal pages. The following query fetches the content from 10 internal pages: + +```sql +SELECT * +FROM my_web.crawler +WHERE url = 'docs.mindsdb.com' +LIMIT 10; +``` + +Another option is to get the content from multiple websites by using the `IN ()` operator: -This should return the contents of docs.mindsdb.com. +```sql +SELECT * +FROM my_web.crawler +WHERE url IN ('docs.mindsdb.com', 'docs.python.org') +LIMIT 1; +``` +### Get PDF Content -Now, let's assume we want to search for the content on multiple websites. +MindsDB accepts [file uploads](/sql/create/file) of `csv`, `xlsx`, `xls`, `sheet`, `json`, and `parquet`. However, you can also configure the web crawler to fetch data from PDF files accessible via URLs. ```sql -SELECT - * +SELECT * FROM my_web.crawler -WHERE - url IN ('docs.mindsdb.com', 'docs.python.org') -LIMIT 30; +WHERE url = '' +LIMIT 1; ``` -This command will crawl two sites and stop when the results count hits 30. The total count of rows in the result will be 30. +## Troubleshooting + + +`Web crawler encounters character encoding issues` + +* **Symptoms**: Extracted text appears garbled or contains strange characters instead of the expected text. +* **Checklist**: + 1. Open a GitHub Issue: If you encounter a bug or a repeatable error with encoding, + report it on the [MindsDB GitHub](https://github.com/mindsdb/mindsdb/issues) repository by opening an issue. + -NOTE: limit is mandatory. If you want to crawl all pages on the site, you can pass a big number in the limit (for example, 10000), more than the expected count of pages on the site. -However, a big limit also increases the time waiting for a response. + +`Web crawler times out while trying to fetch content` +* **Symptoms**: The crawler fails to retrieve data from a website, resulting in timeout errors. +* **Checklist**: + 1. Check the network connection to ensure the target site is reachable. + \ No newline at end of file From 79a75fabea9f141b531f408da1f2ba29e14123cd Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Wed, 8 May 2024 16:58:27 +0200 Subject: [PATCH 03/11] Improve docs --- .../app-integrations/web-crawler.mdx | 41 +++++++++++++++---- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/docs/integrations/app-integrations/web-crawler.mdx b/docs/integrations/app-integrations/web-crawler.mdx index 0b02c181ce6..cf24de406db 100644 --- a/docs/integrations/app-integrations/web-crawler.mdx +++ b/docs/integrations/app-integrations/web-crawler.mdx @@ -5,16 +5,14 @@ sidebarTitle: Web Crawler In this section, we present how to use a web crawler within MindsDB. -A web crawler is a computer program or automated script that browses the internet and navigates through websites, web pages, and web content to gather data. Within the realm of MindsDB, a web crawler can be employed to harvest data, which can be used to train models, -domain specific chatbots or fine-tune LLMs. +A web crawler is an automated script designed to systematically browse and index content on the internet. 
Within MindsDB, you can utilize a web crawler to efficiently collect data from various websites. ## Prerequisites Before proceeding, ensure the following prerequisites are met: 1. Install MindsDB locally via [Docker](/setup/self-hosted/docker) or [Docker Desktop](/setup/self-hosted/docker-desktop). -2. To connect Web Crawler to MindsDB, install the required dependencies following [this instruction](/setup/self-hosted/docker#install-dependencies). -3. Install or ensure access to Web Crawler. +2. To use Web Crawler with MindsDB, install the required dependencies following [this instruction](/setup/self-hosted/docker#install-dependencies). ## Connection @@ -26,12 +24,19 @@ Here is how to initialize a web crawler: CREATE DATABASE my_web WITH ENGINE = 'web'; ``` + +The above query creates a database called `my_web`. This database by default have a table called `crawler` that we can use to crawl data from a given url/urls. + ## Usage + +Specifying a query `LIMIT` is required. To crawl all pages on a site, consider setting the limit to a high value, such as 10,000, which exceeds the expected number of pages. Be aware that setting a higher limit may result in longer response times. + + ### Get Websites Content -Here is how to get the content of `docs.mindsdb.com`: +The following usage examples demonstrate how to retrieve content from `docs.mindsdb.com`: ```sql SELECT * @@ -40,7 +45,7 @@ WHERE url = 'docs.mindsdb.com' LIMIT 1; ``` -You can also get the content of internal pages. Here is how to fetch the content from 10 internal pages: +You can also retrieve content from internal pages. The following query fetches the content from 10 internal pages: ```sql SELECT * @@ -49,7 +54,7 @@ WHERE url = 'docs.mindsdb.com' LIMIT 10; ``` -Another option is to get the content from multiple websites. +Another option is to get the content from multiple websites by using the `IN ()` operator: ```sql SELECT * @@ -60,7 +65,7 @@ LIMIT 1; ### Get PDF Content -MindsDB accepts [file uploads](/sql/create/file) of `csv`, `xlsx`, `xls`, `sheet`, `json`, and `parquet`. However, you can utilize the web crawler to fetch data from `pdf` files. +MindsDB accepts [file uploads](/sql/create/file) of `csv`, `xlsx`, `xls`, `sheet`, `json`, and `parquet`. However, you can also configure the web crawler to fetch data from PDF files accessible via URLs. ```sql SELECT * @@ -69,4 +74,22 @@ WHERE url = '' LIMIT 1; ``` -For example, you can provide a link to a `pdf` file stored in Amazon S3. +## Troubleshooting + + +`Web crawler encounters character encoding issues` + +* **Symptoms**: Extracted text appears garbled or contains strange characters instead of the expected text. +* **Checklist**: + 1. Open a GitHub Issue: If you encounter a bug or a repeatable error with encoding, + report it on the [MindsDB GitHub](https://github.com/mindsdb/mindsdb/issues) repository by opening an issue. + + + + +`Web crawler times out while trying to fetch content` + +* **Symptoms**: The crawler fails to retrieve data from a website, resulting in timeout errors. +* **Checklist**: + 1. Check the network connection to ensure the target site is reachable. 
+ \ No newline at end of file From 0a9fa1c86d75cdbb77c0b31d27b7721135d629fa Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Thu, 9 May 2024 13:20:16 +0200 Subject: [PATCH 04/11] Re write helper functions --- .../handlers/web_handler/urlcrawl_helpers.py | 193 ++++++++++++------ 1 file changed, 128 insertions(+), 65 deletions(-) diff --git a/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py b/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py index 0f8a2024a94..fb67a0b9213 100644 --- a/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +++ b/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py @@ -2,8 +2,9 @@ import io import traceback from threading import Lock -from urllib.parse import urljoin, urlparse +from urllib.parse import urljoin, urlparse, urlunparse +import html2text import fitz # PyMuPDF import pandas as pd import requests @@ -13,13 +14,28 @@ logger = log.getLogger(__name__) -def pdf_to_markdown(response): +def pdf_to_markdown(response, gap_threshold=10): + """ + Convert a PDF document to Markdown text. - file_stream = io.BytesIO(response.content) + Args: + response: the response object containing the PDF data + gap_threshold (int): the vertical gap size that triggers a new line in the output (default 10) - document = fitz.open(stream=file_stream, filetype="pdf") + Returns: + A string containing the converted Markdown text. - markdown_text = "" + Raises: + Exception -- if the PDF data cannot be processed. + """ + + try: + file_stream = io.BytesIO(response.content) + document = fitz.open(stream=file_stream, filetype="pdf") + except Exception as e: + raise Exception("Failed to process PDF data: " + str(e)) + + markdown_lines = [] for page_num in range(len(document)): page = document.load_page(page_num) @@ -34,24 +50,45 @@ def pdf_to_markdown(response): block_text = block[4] # Check if there's a large vertical gap between this block and the previous one - if y0 - previous_block_bottom > 10: - markdown_text += "\n" + if y0 - previous_block_bottom > gap_threshold: + markdown_lines.append("") - markdown_text += block_text + "\n" + markdown_lines.append(block_text) previous_block_bottom = y1 - markdown_text += "\n" + markdown_lines.append("") document.close() - return markdown_text + return "\n".join(markdown_lines) + +def is_valid(url) -> bool: + """ + Check if a URL is valid. + + Args: + url: the URL to check -def is_valid(url): + Returns: + bool: True if the URL is valid, False otherwise. + """ parsed = urlparse(url) return bool(parsed.netloc) and bool(parsed.scheme) -def parallel_get_all_website_links(urls): +def parallel_get_all_website_links(urls) -> dict: + """ + Fetch all website links from a list of URLs. + + Args: + urls (list): a list of URLs to fetch links from + + Returns: + A dictionary mapping each URL to a list of links found on that URL. + + Raises: + Exception: if an error occurs while fetching links from a URL. + """ url_contents = {} if len(urls) <= 10: @@ -69,12 +106,22 @@ def parallel_get_all_website_links(urls): url_contents[url] = future.result() except Exception as exc: logger.error(f'{url} generated an exception: {exc}') + # don't raise the exception, just log it, continue processing other urls return url_contents -def get_all_website_links(url): - logger.info("crawling: {url} ...".format(url=url)) +def get_all_website_links(url) -> dict: + """ + Fetch all website links from a URL. 
+ + Args: + url (str): the URL to fetch links from + + Returns: + A dictionary containing the URL, the extracted links, the HTML content, the text content, and any error that occurred. + """ + logger.info("rawling: {url} ...".format(url=url)) urls = set() domain_name = urlparse(url).netloc @@ -107,10 +154,7 @@ def get_all_website_links(url): continue href = urljoin(url, href) parsed_href = urlparse(href) - - href = ( - parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path - ) + href = urlunparse((parsed_href.scheme, parsed_href.netloc, parsed_href.path, '', '', '')) if not is_valid(href): continue if href in urls: @@ -141,44 +185,42 @@ def get_all_website_links(url): } -def get_readable_text_from_soup(soup): - markdown_output = "" - - for tag in soup.find_all( - ["h1", "h2", "h3", "h4", "h5", "h6", "p", "a", "ul", "ol", "li"] - ): - if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: - markdown_output += ( - "#" * int(tag.name[1]) + " " + tag.get_text().strip() + "\n\n" - ) - elif tag.name == "p": - markdown_output += tag.get_text().strip() + "\n\n" - elif tag.name == "a": - markdown_output += f"[{tag.get_text().strip()}]({tag.get('href')})\n\n" - elif tag.name == "ul": - for li in tag.find_all("li"): - markdown_output += f"* {li.get_text().strip()}\n" - markdown_output += "\n" - elif tag.name == "ol": - for index, li in enumerate(tag.find_all("li")): - markdown_output += f"{index + 1}. {li.get_text().strip()}\n" - markdown_output += "\n" - - return markdown_output - - -def get_all_website_links_rec(url, reviewd_urls, limit=None): +def get_readable_text_from_soup(soup) -> str: + """ + Extract readable text from a BeautifulSoup object and convert it to Markdown. + + Args: + soup (BeautifulSoup): a BeautifulSoup object + + Returns: + The extracted text in Markdown format. + """ + html_converter = html2text.HTML2Text() + html_converter.ignore_links = False + return html_converter.handle(str(soup)) + +def get_all_website_links_recursively(url, reviewed_urls, limit=None): + """ + Recursively gathers all links from a given website up to a specified limit. + + Args: + url (str): The starting URL to fetch links from. + reviewed_urls (dict): A dictionary to keep track of reviewed URLs and associated data. + limit (int, optional): The maximum number of URLs to process. + + TODO: Refactor this function to use a iterative aproach instead of recursion + """ if limit is not None: - if len(reviewd_urls) >= limit: - return reviewd_urls + if len(reviewed_urls) >= limit: + return reviewed_urls - if url not in reviewd_urls: + if url not in reviewed_urls: try: - reviewd_urls[url] = get_all_website_links(url) + reviewed_urls[url] = get_all_website_links(url) except Exception as e: error_message = traceback.format_exc().splitlines()[-1] logger.error("An exception occurred: %s", str(e)) - reviewd_urls[url] = { + reviewed_urls[url] = { "url": url, "urls": [], "html_content": "", @@ -189,16 +231,16 @@ def get_all_website_links_rec(url, reviewd_urls, limit=None): to_rev_url_list = [] # create a list of new urls to review that don't exist in the already reviewed ones - for new_url in reviewd_urls[url]["urls"]: + for new_url in reviewed_urls[url]["urls"]: # if this is already in the urls, then no need to go and crawl for it - if new_url in reviewd_urls or new_url in to_rev_url_list: + if new_url in reviewed_urls or new_url in to_rev_url_list: continue # insert immediately to count limit between threads. 
fill later url_list_lock = Lock() with url_list_lock: - if limit is None or len(reviewd_urls) < limit: - reviewd_urls[new_url] = {} + if limit is None or len(reviewed_urls) < limit: + reviewed_urls[new_url] = {} to_rev_url_list.append(new_url) else: break @@ -206,15 +248,24 @@ def get_all_website_links_rec(url, reviewd_urls, limit=None): if len(to_rev_url_list) > 0: new_revised_urls = parallel_get_all_website_links(to_rev_url_list) - reviewd_urls.update(new_revised_urls) + reviewed_urls.update(new_revised_urls) for new_url in new_revised_urls: - get_all_website_links_rec(new_url, reviewd_urls, limit) + get_all_website_links_recursively(new_url, reviewed_urls, limit) + +def get_all_websites(urls, limit=1, html=False) -> pd.DataFrame: + """ + Crawl a list of websites and return a DataFrame containing the results. -def get_all_websites(urls, limit=1, html=False): - reviewd_urls = {} + Args: + urls (list): a list of URLs to crawl + html (bool): a boolean indicating whether to include the HTML content in the results + Returns: + A DataFrame containing the results. + """ + reviewed_urls = {} def fetch_url(url): # Allow URLs to be passed wrapped in quotation marks so they can be used # directly from the SQL editor. @@ -224,8 +275,9 @@ def fetch_url(url): if urlparse(url).scheme == "": # Try HTTPS first url = "https://" + url - get_all_website_links_rec(url, reviewd_urls, limit) + get_all_website_links_recursively(url, reviewed_urls, limit) + # Use a ThreadPoolExecutor to run the helper function in parallel. with concurrent.futures.ThreadPoolExecutor() as executor: future_to_url = {executor.submit(fetch_url, url): url for url in urls} @@ -236,21 +288,32 @@ def fetch_url(url): if html is False: columns_to_ignore += ["html_content"] df = dict_to_dataframe( - reviewd_urls, columns_to_ignore=columns_to_ignore, index_name="url" + reviewed_urls, columns_to_ignore=columns_to_ignore, index_name="url" ) - print('get_all_websites', df) + if not df.empty and df[df.error.isna()].empty: raise Exception(str(df.iloc[0].error)) return df - -def dict_to_dataframe(dict_of_dicts, columns_to_ignore=None, index_name=None): +def dict_to_dataframe(dict_of_dicts, columns_to_ignore=None, index_name=None) -> pd.DataFrame: + """ + Convert a dictionary of dictionaries to a DataFrame. + + Args: + dict_of_dicts (dict): a dictionary of dictionaries + columns_to_ignore (list): a list of columns to ignore + index_name (str): the name of the index column + Returns: + A DataFrame containing the data. 
+ """ df = pd.DataFrame.from_dict(dict_of_dicts, orient="index") if columns_to_ignore: - df = df.drop(columns_to_ignore, axis=1, errors="ignore") + for column in columns_to_ignore: + if column in df.columns: + df = df.drop(column, axis=1) if index_name: df.index.name = index_name - return df + return df \ No newline at end of file From 88c49a396cfdc839b6a3188a73ac1395ddaa5fa9 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Thu, 9 May 2024 13:27:31 +0200 Subject: [PATCH 05/11] Lint --- .../handlers/web_handler/urlcrawl_helpers.py | 10 +++++++--- .../integrations/handlers/web_handler/web_handler.py | 5 ----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py b/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py index fb67a0b9213..2d7d166cee7 100644 --- a/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +++ b/mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py @@ -62,6 +62,7 @@ def pdf_to_markdown(response, gap_threshold=10): return "\n".join(markdown_lines) + def is_valid(url) -> bool: """ Check if a URL is valid. @@ -199,6 +200,7 @@ def get_readable_text_from_soup(soup) -> str: html_converter.ignore_links = False return html_converter.handle(str(soup)) + def get_all_website_links_recursively(url, reviewed_urls, limit=None): """ Recursively gathers all links from a given website up to a specified limit. @@ -208,7 +210,7 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None): reviewed_urls (dict): A dictionary to keep track of reviewed URLs and associated data. limit (int, optional): The maximum number of URLs to process. - TODO: Refactor this function to use a iterative aproach instead of recursion + TODO: Refactor this function to use a iterative aproach instead of recursion """ if limit is not None: if len(reviewed_urls) >= limit: @@ -266,6 +268,7 @@ def get_all_websites(urls, limit=1, html=False) -> pd.DataFrame: A DataFrame containing the results. """ reviewed_urls = {} + def fetch_url(url): # Allow URLs to be passed wrapped in quotation marks so they can be used # directly from the SQL editor. @@ -295,10 +298,11 @@ def fetch_url(url): raise Exception(str(df.iloc[0].error)) return df + def dict_to_dataframe(dict_of_dicts, columns_to_ignore=None, index_name=None) -> pd.DataFrame: """ Convert a dictionary of dictionaries to a DataFrame. 
- + Args: dict_of_dicts (dict): a dictionary of dictionaries columns_to_ignore (list): a list of columns to ignore @@ -316,4 +320,4 @@ def dict_to_dataframe(dict_of_dicts, columns_to_ignore=None, index_name=None) -> if index_name: df.index.name = index_name - return df \ No newline at end of file + return df diff --git a/mindsdb/integrations/handlers/web_handler/web_handler.py b/mindsdb/integrations/handlers/web_handler/web_handler.py index 1cf01b897a7..885b377dcad 100644 --- a/mindsdb/integrations/handlers/web_handler/web_handler.py +++ b/mindsdb/integrations/handlers/web_handler/web_handler.py @@ -5,11 +5,6 @@ from mindsdb.integrations.libs.api_handler import APIHandler, APITable from mindsdb.integrations.utilities.sql_utils import extract_comparison_conditions, project_dataframe -from mindsdb.integrations.libs.response import ( - HandlerStatusResponse as StatusResponse, - HandlerResponse as Response, - RESPONSE_TYPE -) from mindsdb.utilities.security import is_private_url from mindsdb.utilities.config import Config From a6d5a7edfc7fa82331ec2ff1bcbfbd8b462e2793 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Thu, 9 May 2024 14:07:52 +0200 Subject: [PATCH 06/11] Add missing dependency --- mindsdb/integrations/handlers/web_handler/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mindsdb/integrations/handlers/web_handler/requirements.txt b/mindsdb/integrations/handlers/web_handler/requirements.txt index feefb08c354..e01687458b6 100644 --- a/mindsdb/integrations/handlers/web_handler/requirements.txt +++ b/mindsdb/integrations/handlers/web_handler/requirements.txt @@ -1,2 +1,3 @@ bs4 -pymupdf \ No newline at end of file +pymupdf +html2text \ No newline at end of file From 0ce014876810d3b0cb86bed542bc720266295239 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Thu, 9 May 2024 16:39:52 +0200 Subject: [PATCH 07/11] Add unit tests for web handler --- .github/workflows/test_on_push.yml | 3 +- .../handlers/web_handler/requirements.txt | 1 - .../handlers/web_handler/tests/__init__.py | 0 .../web_handler/tests/example_data.py | 18 --- .../web_handler/tests/test_helpers.py | 49 ------- .../handlers/web_handler/web_handler.py | 45 +++--- tests/unit/handlers/test_web.py | 130 ++++++++++++++++++ 7 files changed, 154 insertions(+), 92 deletions(-) delete mode 100644 mindsdb/integrations/handlers/web_handler/tests/__init__.py delete mode 100644 mindsdb/integrations/handlers/web_handler/tests/example_data.py delete mode 100644 mindsdb/integrations/handlers/web_handler/tests/test_helpers.py create mode 100644 tests/unit/handlers/test_web.py diff --git a/.github/workflows/test_on_push.yml b/.github/workflows/test_on_push.yml index ba5c45719fd..8e3eb0df371 100644 --- a/.github/workflows/test_on_push.yml +++ b/.github/workflows/test_on_push.yml @@ -119,6 +119,7 @@ jobs: pip install mindsdb[mssql] pip install mindsdb[clickhouse] pip install mindsdb[snowflake] + pip install mindsdb[web] pip freeze - name: Run unit tests run: | @@ -133,7 +134,7 @@ jobs: fi - name: Run Handlers tests and submit Coverage to coveralls run: | - handlers=("mysql" "postgres" "mssql" "clickhouse" "snowflake") + handlers=("mysql" "postgres" "mssql" "clickhouse" "snowflake" "web") for handler in "${handlers[@]}" do pytest --cov=mindsdb/integrations/handlers/${handler}_handler tests/unit/handlers/test_${handler}.py diff --git a/mindsdb/integrations/handlers/web_handler/requirements.txt b/mindsdb/integrations/handlers/web_handler/requirements.txt index e01687458b6..062cdd061ad 100644 --- 
a/mindsdb/integrations/handlers/web_handler/requirements.txt +++ b/mindsdb/integrations/handlers/web_handler/requirements.txt @@ -1,3 +1,2 @@ -bs4 pymupdf html2text \ No newline at end of file diff --git a/mindsdb/integrations/handlers/web_handler/tests/__init__.py b/mindsdb/integrations/handlers/web_handler/tests/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/mindsdb/integrations/handlers/web_handler/tests/example_data.py b/mindsdb/integrations/handlers/web_handler/tests/example_data.py deleted file mode 100644 index 2b2708d351a..00000000000 --- a/mindsdb/integrations/handlers/web_handler/tests/example_data.py +++ /dev/null @@ -1,18 +0,0 @@ -PDF_CONTENT = ( - b"%PDF-1.7\n\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n\n2 0 obj\n<< /Type /Pages " - b"/Kids [3 0 R] /Count 1 >>\nendobj\n\n3 0 obj\n<< /Type /Page /Parent 2 0 R /Contents 4 0 R " - b">>\nendobj\n\n4 0 obj\n<< /Length 22 >>\nstream\nBT\n/Helvetica 12 Tf\n1 0 0 1 50 700 Tm\n(" - b"Hello, this is a test!) Tj\nET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 " - b"f\n0000000010 00000 n\n0000000077 00000 n\n0000000122 00000 n\n0000000203 00000 n\n0000000277 " - b"00000 n\ntrailer\n<< /Size 5 /Root 1 0 R >>\nstartxref\n343\n%%EOF\n " -) - -BROKEN_PDF_CONTENT = b"%PDF-1.4\n\nThis is not a valid PDF file content\n" - -HTML_SAMPLE_1 = "

<h1>Heading One</h1><h2>Heading Two</h2><ul><li>item1</li><li>item2</li><li>item3</li></ul>
" - -MARKDOWN_SAMPLE_1 = "# Heading One\n\n## Heading Two\n\n* item1\n* item2\n* item3\n\n" - -HTML_SAMPLE_2 = '

<h3>Heading</h3><p>text</p><a href="https://google.com">link</a><li>item1</li>
' - -MARKDOWN_SAMPLE_2 = "### Heading\n\ntext\n\n[link](https://google.com)\n\n\n\n" diff --git a/mindsdb/integrations/handlers/web_handler/tests/test_helpers.py b/mindsdb/integrations/handlers/web_handler/tests/test_helpers.py deleted file mode 100644 index f67e64bcf7e..00000000000 --- a/mindsdb/integrations/handlers/web_handler/tests/test_helpers.py +++ /dev/null @@ -1,49 +0,0 @@ -from unittest.mock import patch - -import pytest -from bs4 import BeautifulSoup -from fitz.fitz import FileDataError - -from mindsdb.integrations.handlers.web_handler import urlcrawl_helpers as C -from mindsdb.integrations.handlers.web_handler.tests import example_data as D - - -class TestPDFToMarkdownTest: - @patch("requests.Response") - def test_pdf_to_markdown(self, mock_response) -> None: - response = mock_response.return_value - response.content = D.PDF_CONTENT - result = C.pdf_to_markdown(response) - assert "Hello, this is a test!" in result - - @patch("requests.Response") - def test_broken_pdf_to_markdown(self, mock_response) -> None: - response = mock_response.return_value - response.content = D.BROKEN_PDF_CONTENT - - with pytest.raises(FileDataError) as excinfo: - C.pdf_to_markdown(response) - - assert str(excinfo.value) == "cannot open broken document" - - -@pytest.mark.parametrize( - "url, result", - [ - ("google", False), - ("google.com", False), - ("https://google.com", True), - ("", False), - ], -) -def test_url_validation(url: str, result: bool) -> None: - assert C.is_valid(url) == result - - -@pytest.mark.parametrize( - "html, markdown", - [(D.HTML_SAMPLE_1, D.MARKDOWN_SAMPLE_1), (D.HTML_SAMPLE_2, D.MARKDOWN_SAMPLE_2)], -) -def test_get_readable_text_from_soup(html: str, markdown: str) -> None: - soup = BeautifulSoup(html, "html.parser") - assert markdown == C.get_readable_text_from_soup(soup) diff --git a/mindsdb/integrations/handlers/web_handler/web_handler.py b/mindsdb/integrations/handlers/web_handler/web_handler.py index 885b377dcad..e1613cd502e 100644 --- a/mindsdb/integrations/handlers/web_handler/web_handler.py +++ b/mindsdb/integrations/handlers/web_handler/web_handler.py @@ -14,40 +14,38 @@ class CrawlerTable(APITable): def select(self, query: ast.Select) -> pd.DataFrame: + """ + Selects data from the provided websites + Args: + query (ast.Select): Given SQL SELECT query + + Returns: + dataframe: Dataframe containing the crawled data + + Raises: + NotImplementedError: If the query is not supported + """ conditions = extract_comparison_conditions(query.where) urls = [] - for op, arg1, arg2 in conditions: - - if op == 'or': + for operator, arg1, arg2 in conditions: + if operator == 'or': raise NotImplementedError('OR is not supported') - if arg1 == 'url': - url = arg2 - - if op == '=': - urls = [str(url)] - elif op == 'in': - if type(url) == str: - urls = [str(url)] - else: - urls = url + if operator in ['=', 'in']: + urls = [str(arg2)] if isinstance(arg2, str) else arg2 else: raise NotImplementedError('Invalid URL format. 
Please provide a single URL like url = "example.com" or' 'multiple URLs using the format url IN ("url1", "url2", ...)') - else: - pass if len(urls) == 0: raise NotImplementedError( 'You must specify what url you want to crawl, for example: SELECT * FROM crawl WHERE url = "someurl"') if query.limit is None: - raise NotImplementedError('You must specify a LIMIT which defines the number of pages to crawl') - limit = query.limit.value + raise NotImplementedError('You must specify a LIMIT clause which defines the number of pages to crawl') - if limit < 0: - limit = 0 + limit = query.limit.value config = Config() is_cloud = config.get("cloud", False) @@ -66,6 +64,9 @@ def select(self, query: ast.Select) -> pd.DataFrame: return result def get_columns(self): + """ + Returns the columns of the crawler table + """ return [ 'url', 'text_content', @@ -74,11 +75,9 @@ def get_columns(self): class WebHandler(APIHandler): - """A class for handling crawling content from websites. - - Attributes: """ - + Web handler, handling crawling content from websites. + """ def __init__(self, name=None, **kwargs): super().__init__(name) crawler = CrawlerTable(self) diff --git a/tests/unit/handlers/test_web.py b/tests/unit/handlers/test_web.py new file mode 100644 index 00000000000..9f724639d8a --- /dev/null +++ b/tests/unit/handlers/test_web.py @@ -0,0 +1,130 @@ +import unittest +from mindsdb.integrations.libs.api_handler_exceptions import TableAlreadyExists +from mindsdb.integrations.handlers.web_handler.web_handler import WebHandler +from mindsdb.integrations.handlers.web_handler.web_handler import CrawlerTable +from mindsdb.integrations.handlers.web_handler import urlcrawl_helpers as helpers +from unittest.mock import patch, MagicMock +import concurrent.futures +import pytest +from bs4 import BeautifulSoup +import pandas as pd + + +class TestWebsHandler(unittest.TestCase): + + def setUp(self) -> None: + self.handler = WebHandler(name='test_web_handler') + + def test_crawler_already_registered(self): + with self.assertRaises(TableAlreadyExists): + self.handler._register_table('crawler', CrawlerTable) + + +PDF_CONTENT = ( + b"%PDF-1.7\n\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n\n2 0 obj\n<< /Type /Pages " + b"/Kids [3 0 R] /Count 1 >>\nendobj\n\n3 0 obj\n<< /Type /Page /Parent 2 0 R /Contents 4 0 R " + b">>\nendobj\n\n4 0 obj\n<< /Length 22 >>\nstream\nBT\n/Helvetica 12 Tf\n1 0 0 1 50 700 Tm\n(" + b"Hello, this is a test!) Tj\nET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 " + b"f\n0000000010 00000 n\n0000000077 00000 n\n0000000122 00000 n\n0000000203 00000 n\n0000000277 " + b"00000 n\ntrailer\n<< /Size 5 /Root 1 0 R >>\nstartxref\n343\n%%EOF\n " +) + +BROKEN_PDF_CONTENT = b"%PDF-1.4\n\nThis is not a valid PDF file content\n" + +HTML_SAMPLE_1 = "

<h1>Heading One</h1> <h2>Heading Two</h2>
" + +MARKDOWN_SAMPLE_1 = "# Heading One \n\n ## Heading Two" + + +class TestPDFToMarkdownTest: + @patch("requests.Response") + def test_pdf_to_markdown(self, mock_response) -> None: + response = mock_response.return_value + response.content = PDF_CONTENT + result = helpers.pdf_to_markdown(response) + assert "Hello, this is a test!" in result + + @patch("requests.Response") + def test_broken_pdf_to_markdown(self, mock_response) -> None: + response = mock_response.return_value + response.content = BROKEN_PDF_CONTENT + + with pytest.raises(Exception, match='Failed to process PDF data: cannot open broken document'): + helpers.pdf_to_markdown(response) + + +@pytest.mark.parametrize( + "url, result", + [ + ("google", False), + ("google.com", False), + ("https://google.com", True), + ("", False), + ], +) +def test_url_validation(url: str, result: bool) -> None: + assert helpers.is_valid(url) == result + + +@pytest.mark.parametrize( + "html, markdown", + [(HTML_SAMPLE_1, MARKDOWN_SAMPLE_1)], +) +def test_get_readable_text_from_soup(html: str, markdown: str) -> None: + soup = BeautifulSoup(html, "html.parser") + import re + expected = re.sub(r'\s+', ' ', markdown).strip() + actual = re.sub(r'\s+', ' ', helpers.get_readable_text_from_soup(soup)).strip() + + assert expected == actual + + +@patch("mindsdb.integrations.handlers.web_handler.urlcrawl_helpers.get_all_website_links") +@patch("concurrent.futures.ProcessPoolExecutor") +def test_parallel_get_all_website_links(mock_executor, mock_get_links): + # Setup: Mock the get_all_website_links function to return a list of links + mock_get_links.return_value = ["link1", "link2", "link3"] + + # Setup: Mock the ProcessPoolExecutor class to return a mock executor + mock_executor_instance = MagicMock() + mock_executor.return_value.__enter__.return_value = mock_executor_instance + + # Setup: Mock the executor to return a future that immediately completes with a result + mock_future = concurrent.futures.Future() + mock_future.set_result(["link1", "link2", "link3"]) + mock_executor_instance.submit.return_value = mock_future + + # Call the function with a list of URLs + urls = ["url1", "url2", "url3"] + result = helpers.parallel_get_all_website_links(urls) + + # Assert: Check if the function returns the expected result + expected = { + "url1": ["link1", "link2", "link3"], + "url2": ["link1", "link2", "link3"], + "url3": ["link1", "link2", "link3"], + } + assert result == expected + + # Assert: Check if the mocks were called as expected + mock_get_links.assert_called() + + +def test_dict_to_dataframe(): + # Setup: Create a dictionary of dictionaries + data = { + "row1": {"column1": 1, "column2": 2, "column3": 3}, + "row2": {"column1": 4, "column2": 5, "column3": 6}, + "row3": {"column1": 7, "column2": 8, "column3": 9}, + } + + # Call the function with the data, ignoring "column2" and setting the index name to "ID" + df = helpers.dict_to_dataframe(data, columns_to_ignore=["column2"], index_name="ID") + + # Assert: Check if the DataFrame has the expected structure + expected = pd.DataFrame({ + "column1": {"row1": 1, "row2": 4, "row3": 7}, + "column3": {"row1": 3, "row2": 6, "row3": 9}, + }) + expected.index.name = "ID" + pd.testing.assert_frame_equal(df, expected) From 97caf4bf26e01f7f8211f8ea0eadbfb75df877b6 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Fri, 10 May 2024 11:58:44 +0200 Subject: [PATCH 08/11] Update test --- tests/unit/handlers/test_web.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/handlers/test_web.py 
b/tests/unit/handlers/test_web.py index 9f724639d8a..d91670d0030 100644 --- a/tests/unit/handlers/test_web.py +++ b/tests/unit/handlers/test_web.py @@ -49,7 +49,7 @@ def test_broken_pdf_to_markdown(self, mock_response) -> None: response = mock_response.return_value response.content = BROKEN_PDF_CONTENT - with pytest.raises(Exception, match='Failed to process PDF data: cannot open broken document'): + with pytest.raises(Exception, match="Failed to process PDF data"): helpers.pdf_to_markdown(response) From c3f9b97aa77e5ae698ae8a16d95ec3594440e838 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Tue, 14 May 2024 12:02:31 +0200 Subject: [PATCH 09/11] Review changes --- docs/integrations/app-integrations/web-crawler.mdx | 4 ++-- mindsdb/integrations/handlers/web_handler/README.md | 4 ++-- .../integrations/handlers/web_handler/web_handler.py | 12 ++++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/docs/integrations/app-integrations/web-crawler.mdx b/docs/integrations/app-integrations/web-crawler.mdx index cf24de406db..b744f9d08ca 100644 --- a/docs/integrations/app-integrations/web-crawler.mdx +++ b/docs/integrations/app-integrations/web-crawler.mdx @@ -25,13 +25,13 @@ CREATE DATABASE my_web WITH ENGINE = 'web'; ``` -The above query creates a database called `my_web`. This database by default have a table called `crawler` that we can use to crawl data from a given url/urls. +The above query creates a database called `my_web`. This database by default has a table called `crawler` that we can use to crawl data from a given url/urls. ## Usage -Specifying a query `LIMIT` is required. To crawl all pages on a site, consider setting the limit to a high value, such as 10,000, which exceeds the expected number of pages. Be aware that setting a higher limit may result in longer response times. +Specifying a `LIMIT` clause is required. To crawl all pages on a site, consider setting the limit to a high value, such as 10,000, which exceeds the expected number of pages. Be aware that setting a higher limit may result in longer response times. ### Get Websites Content diff --git a/mindsdb/integrations/handlers/web_handler/README.md b/mindsdb/integrations/handlers/web_handler/README.md index cf24de406db..b744f9d08ca 100644 --- a/mindsdb/integrations/handlers/web_handler/README.md +++ b/mindsdb/integrations/handlers/web_handler/README.md @@ -25,13 +25,13 @@ CREATE DATABASE my_web WITH ENGINE = 'web'; ``` -The above query creates a database called `my_web`. This database by default have a table called `crawler` that we can use to crawl data from a given url/urls. +The above query creates a database called `my_web`. This database by default has a table called `crawler` that we can use to crawl data from a given url/urls. ## Usage -Specifying a query `LIMIT` is required. To crawl all pages on a site, consider setting the limit to a high value, such as 10,000, which exceeds the expected number of pages. Be aware that setting a higher limit may result in longer response times. +Specifying a `LIMIT` clause is required. To crawl all pages on a site, consider setting the limit to a high value, such as 10,000, which exceeds the expected number of pages. Be aware that setting a higher limit may result in longer response times. 
### Get Websites Content diff --git a/mindsdb/integrations/handlers/web_handler/web_handler.py b/mindsdb/integrations/handlers/web_handler/web_handler.py index e1613cd502e..42a42bf500e 100644 --- a/mindsdb/integrations/handlers/web_handler/web_handler.py +++ b/mindsdb/integrations/handlers/web_handler/web_handler.py @@ -1,5 +1,6 @@ import pandas as pd +from mindsdb.integrations.libs.response import HandlerStatusResponse from mindsdb_sql.parser import ast from mindsdb.integrations.libs.api_handler import APIHandler, APITable @@ -82,3 +83,14 @@ def __init__(self, name=None, **kwargs): super().__init__(name) crawler = CrawlerTable(self) self._register_table('crawler', crawler) + + def check_connection(self) -> HandlerStatusResponse: + """ + Checks the connection to the web handler + @TODO: Implement a better check for the connection + + Returns: + HandlerStatusResponse: Response containing the status of the connection. Hardcoded to True for now. + """ + response = HandlerStatusResponse(True) + return response \ No newline at end of file From 19bb55bcf9f327d1a4326f6fc653601daeed2985 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Tue, 14 May 2024 14:05:44 +0200 Subject: [PATCH 10/11] Lint --- mindsdb/integrations/handlers/web_handler/web_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindsdb/integrations/handlers/web_handler/web_handler.py b/mindsdb/integrations/handlers/web_handler/web_handler.py index 42a42bf500e..469d3723f95 100644 --- a/mindsdb/integrations/handlers/web_handler/web_handler.py +++ b/mindsdb/integrations/handlers/web_handler/web_handler.py @@ -93,4 +93,4 @@ def check_connection(self) -> HandlerStatusResponse: HandlerStatusResponse: Response containing the status of the connection. Hardcoded to True for now. 
""" response = HandlerStatusResponse(True) - return response \ No newline at end of file + return response From f999bf50c629f091284f68ee46ddf5e673635ee7 Mon Sep 17 00:00:00 2001 From: ZoranPandovski Date: Tue, 14 May 2024 15:40:20 +0200 Subject: [PATCH 11/11] Add more tests for the SELECT validation --- tests/unit/handlers/test_web.py | 180 ++++++++++++++++++-------------- 1 file changed, 102 insertions(+), 78 deletions(-) diff --git a/tests/unit/handlers/test_web.py b/tests/unit/handlers/test_web.py index d91670d0030..2c93147559a 100644 --- a/tests/unit/handlers/test_web.py +++ b/tests/unit/handlers/test_web.py @@ -7,7 +7,6 @@ import concurrent.futures import pytest from bs4 import BeautifulSoup -import pandas as pd class TestWebsHandler(unittest.TestCase): @@ -36,7 +35,7 @@ def test_crawler_already_registered(self): MARKDOWN_SAMPLE_1 = "# Heading One \n\n ## Heading Two" -class TestPDFToMarkdownTest: +class TestWebHelpers(unittest.TestCase): @patch("requests.Response") def test_pdf_to_markdown(self, mock_response) -> None: response = mock_response.return_value @@ -52,79 +51,104 @@ def test_broken_pdf_to_markdown(self, mock_response) -> None: with pytest.raises(Exception, match="Failed to process PDF data"): helpers.pdf_to_markdown(response) - -@pytest.mark.parametrize( - "url, result", - [ - ("google", False), - ("google.com", False), - ("https://google.com", True), - ("", False), - ], -) -def test_url_validation(url: str, result: bool) -> None: - assert helpers.is_valid(url) == result - - -@pytest.mark.parametrize( - "html, markdown", - [(HTML_SAMPLE_1, MARKDOWN_SAMPLE_1)], -) -def test_get_readable_text_from_soup(html: str, markdown: str) -> None: - soup = BeautifulSoup(html, "html.parser") - import re - expected = re.sub(r'\s+', ' ', markdown).strip() - actual = re.sub(r'\s+', ' ', helpers.get_readable_text_from_soup(soup)).strip() - - assert expected == actual - - -@patch("mindsdb.integrations.handlers.web_handler.urlcrawl_helpers.get_all_website_links") -@patch("concurrent.futures.ProcessPoolExecutor") -def test_parallel_get_all_website_links(mock_executor, mock_get_links): - # Setup: Mock the get_all_website_links function to return a list of links - mock_get_links.return_value = ["link1", "link2", "link3"] - - # Setup: Mock the ProcessPoolExecutor class to return a mock executor - mock_executor_instance = MagicMock() - mock_executor.return_value.__enter__.return_value = mock_executor_instance - - # Setup: Mock the executor to return a future that immediately completes with a result - mock_future = concurrent.futures.Future() - mock_future.set_result(["link1", "link2", "link3"]) - mock_executor_instance.submit.return_value = mock_future - - # Call the function with a list of URLs - urls = ["url1", "url2", "url3"] - result = helpers.parallel_get_all_website_links(urls) - - # Assert: Check if the function returns the expected result - expected = { - "url1": ["link1", "link2", "link3"], - "url2": ["link1", "link2", "link3"], - "url3": ["link1", "link2", "link3"], - } - assert result == expected - - # Assert: Check if the mocks were called as expected - mock_get_links.assert_called() - - -def test_dict_to_dataframe(): - # Setup: Create a dictionary of dictionaries - data = { - "row1": {"column1": 1, "column2": 2, "column3": 3}, - "row2": {"column1": 4, "column2": 5, "column3": 6}, - "row3": {"column1": 7, "column2": 8, "column3": 9}, - } - - # Call the function with the data, ignoring "column2" and setting the index name to "ID" - df = helpers.dict_to_dataframe(data, 
columns_to_ignore=["column2"], index_name="ID") - - # Assert: Check if the DataFrame has the expected structure - expected = pd.DataFrame({ - "column1": {"row1": 1, "row2": 4, "row3": 7}, - "column3": {"row1": 3, "row2": 6, "row3": 9}, - }) - expected.index.name = "ID" - pd.testing.assert_frame_equal(df, expected) + def test_url_validation(self): + assert helpers.is_valid('https://google.com') is True + assert helpers.is_valid('google.com') is False + + def test_get_readable_text_from_soup(self) -> None: + soup = BeautifulSoup(HTML_SAMPLE_1, "html.parser") + import re + expected = re.sub(r'\s+', ' ', MARKDOWN_SAMPLE_1).strip() + actual = re.sub(r'\s+', ' ', helpers.get_readable_text_from_soup(soup)).strip() + + assert expected == actual + + @patch("mindsdb.integrations.handlers.web_handler.urlcrawl_helpers.get_all_website_links") + @patch("concurrent.futures.ProcessPoolExecutor") + def test_parallel_get_all_website_links(self, mock_executor, mock_get_links): + # Setup: Mock the get_all_website_links function to return a list of links + mock_get_links.return_value = ["link1", "link2", "link3"] + + # Setup: Mock the ProcessPoolExecutor class to return a mock executor + mock_executor_instance = MagicMock() + mock_executor.return_value.__enter__.return_value = mock_executor_instance + + # Setup: Mock the executor to return a future that immediately completes with a result + mock_future = concurrent.futures.Future() + mock_future.set_result(["link1", "link2", "link3"]) + mock_executor_instance.submit.return_value = mock_future + + # Call the function with a list of URLs + urls = ["url1", "url2", "url3"] + result = helpers.parallel_get_all_website_links(urls) + + # Assert: Check if the function returns the expected result + expected = { + "url1": ["link1", "link2", "link3"], + "url2": ["link1", "link2", "link3"], + "url3": ["link1", "link2", "link3"], + } + assert result == expected + + # Assert: Check if the mocks were called as expected + mock_get_links.assert_called() + + +class TestWebHandler(unittest.TestCase): + + @patch('mindsdb.integrations.handlers.web_handler.web_handler.extract_comparison_conditions') + def test_select_with_or_operator_raise_error(self, mock_extract_comparison_conditions): + mock_extract_comparison_conditions.return_value = [('or', 'url', 'example.com')] + + crawler_table = CrawlerTable(handler=MagicMock()) + mock_query = MagicMock() + mock_ast = MagicMock() + mock_ast.get_type.return_value = 'OR' + + mock_query.where = mock_ast + with self.assertRaises(NotImplementedError) as context: + crawler_table.select(mock_query) + self.assertTrue('OR is not supported' in str(context.exception)) + + @patch('mindsdb.integrations.handlers.web_handler.web_handler.extract_comparison_conditions') + def test_select_with_invalid_url_format(self, mock_extract_comparison_conditions): + mock_extract_comparison_conditions.return_value = [('WHERE', 'url', 'example.com')] + + crawler_table = CrawlerTable(handler=MagicMock()) + mock_query = MagicMock() + mock_ast = MagicMock() + mock_ast.get_type.return_value = 'WHERE URL ("example.com")' + + mock_query.where = mock_ast + with self.assertRaises(NotImplementedError) as context: + crawler_table.select(mock_query) + self.assertTrue('Invalid URL format.' 
in str(context.exception)) + + @patch('mindsdb.integrations.handlers.web_handler.web_handler.extract_comparison_conditions') + def test_select_with_missing_url_(self, mock_extract_comparison_conditions): + mock_extract_comparison_conditions.return_value = [('WHERE', 'id', '1')] + + crawler_table = CrawlerTable(handler=MagicMock()) + mock_query = MagicMock() + mock_ast = MagicMock() + mock_ast.get_type.return_value = 'WHERE ID ("1")' + + mock_query.where = mock_ast + with self.assertRaises(NotImplementedError) as context: + crawler_table.select(mock_query) + self.assertTrue('You must specify what url you want to craw' in str(context.exception)) + + @patch('mindsdb.integrations.handlers.web_handler.web_handler.extract_comparison_conditions') + def test_select_with_missing_limit(self, mock_extract_comparison_conditions): + mock_extract_comparison_conditions.return_value = [('=', 'url', 'https://docs.mindsdb.com')] + + crawler_table = CrawlerTable(handler=MagicMock()) + mock_query = MagicMock() + mock_ast = MagicMock() + mock_ast.get_type.return_value = 'URL = "https://docs.mindsdb.com"' + + mock_query.where = mock_ast + mock_query.limit = None + with self.assertRaises(NotImplementedError) as context: + crawler_table.select(mock_query) + self.assertTrue('You must specify a LIMIT clause' in str(context.exception))
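
Beyond the unit tests above, the rewritten helpers can also be exercised directly in Python. The sketch below is illustrative only and not part of the patch; it assumes the dependencies from `requirements.txt` (`pymupdf`, `html2text`) are installed and that the crawled site is reachable from the current machine.

```python
# Minimal sketch (assumption: network access to the target site) showing how
# the rewritten helpers can be called outside of the SQL interface.
from mindsdb.integrations.handlers.web_handler import urlcrawl_helpers as helpers

# Crawl a single page; html=False drops the "html_content" column, which is
# also what CrawlerTable.select passes when serving SQL queries.
df = helpers.get_all_websites(["docs.mindsdb.com"], limit=1, html=False)

# The DataFrame is indexed by url and keeps the extracted text plus any
# per-URL error, matching the columns exposed by the crawler table.
print(df[["text_content", "error"]])
```

Locally, the new tests run with the same invocation the CI workflow now uses, for example `pytest tests/unit/handlers/test_web.py`.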