59 changes: 49 additions & 10 deletions mindsdb_sdk/agents.py
@@ -119,21 +119,41 @@ def add_file(self, file_path: str, description: str, knowledge_base: str = None)
"""
self.collection.add_file(self.name, file_path, description, knowledge_base)

def add_webpages(self, urls: List[str], description: str, knowledge_base: str = None):
def add_webpages(
self,
urls: List[str],
description: str,
knowledge_base: str = None,
crawl_depth: int = 1,
filters: List[str] = None):
"""
Add a list of crawled URLs to the agent for retrieval.
Add a crawled URL to the agent for retrieval.

:param urls: List of URLs to be crawled and added.
:param urls: URLs of pages to be crawled and added.
:param description: Description of the webpages. Used by agent to know when to do retrieval.
:param knowledge_base: Name of an existing knowledge base to be used. Will create a default knowledge base if not given.
:param crawl_depth: How deep to crawl from each base URL. 0 = scrape given URLs only, -1 = default max
:param filters: Include only URLs that match these regex patterns
"""
self.collection.add_webpages(self.name, urls, description, knowledge_base)
self.collection.add_webpages(self.name, urls, description, knowledge_base=knowledge_base, crawl_depth=crawl_depth, filters=filters)

def add_webpage(self, url: str, description: str, knowledge_base: str = None):
def add_webpage(
self,
url: str,
description: str,
knowledge_base: str = None,
crawl_depth: int = 1,
filters: List[str] = None):
"""
Add a crawled URL to the agent for retrieval.

:param url: URL of the page to be crawled and added.
:param description: Description of the webpage. Used by agent to know when to do retrieval.
:param knowledge_base: Name of an existing knowledge base to be used. Will create a default knowledge base if not given.
:param crawl_depth: How deep to crawl from each base URL. 0 = scrape given URLs only, -1 = default max
:param filters: Include only URLs that match these regex patterns
"""
self.collection.add_webpage(self.name, url, description, knowledge_base)
self.collection.add_webpage(self.name, url, description, knowledge_base=knowledge_base, crawl_depth=crawl_depth, filters=filters)

def add_database(self, database: str, tables: List[str], description: str):
"""
@@ -313,14 +333,24 @@ def add_file(self, name: str, file_path: str, description: str, knowledge_base:
"""
self.add_files(name, [file_path], description, knowledge_base)

def add_webpages(self, name: str, urls: List[str], description: str, knowledge_base: str = None):
def add_webpages(
self,
name: str,
urls: List[str],
description: str,
knowledge_base: str = None,
crawl_depth: int = 1,
filters: List[str] = None
):
"""
Add a list of webpages to the agent for retrieval.

:param name: Name of the agent
:param urls: List of URLs of the webpages to be added.
:param description: Description of the webpages. Used by agent to know when to do retrieval.
:param knowledge_base: Name of an existing knowledge base to be used. Will create a default knowledge base if not given.
:param crawl_depth: How deep to crawl from each base URL. 0 = scrape given URLs only
:param filters: Include only URLs that match these regex patterns
"""
if not urls:
return
@@ -339,7 +369,7 @@ def add_webpages(self, name: str, urls: List[str], description: str, knowledge_b
kb = self._create_default_knowledge_base(agent, kb_name)

# Insert crawled webpage.
kb.insert_webpages(urls)
kb.insert_webpages(urls, crawl_depth=crawl_depth, filters=filters)

# Make sure skill name is unique.
skill_name = f'{domain}{path}_retrieval_skill_{uuid4()}'
@@ -351,16 +381,25 @@ def add_webpages(self, name: str, urls: List[str], description: str, knowledge_b
agent.skills.append(webpage_retrieval_skill)
self.update(agent.name, agent)

def add_webpage(self, name: str, url: str, description: str, knowledge_base: str = None):
def add_webpage(
self,
name: str,
url: str,
description: str,
knowledge_base: str = None,
crawl_depth: int = 1,
filters: List[str] = None):
"""
Add a webpage to the agent for retrieval.

:param name: Name of the agent
:param url: URL of the webpage to be added, or name of existing webpage.
:param description: Description of the webpage. Used by agent to know when to do retrieval.
:param knowledge_base: Name of an existing knowledge base to be used. Will create a default knowledge base if not given.
:param crawl_depth: How deep to crawl from each base URL. 0 = scrape given URLs only
:param filters: Include only URLs that match these regex patterns
"""
self.add_webpages(name, [url], description, knowledge_base)
self.add_webpages(name, [url], description, knowledge_base=knowledge_base, crawl_depth=crawl_depth, filters=filters)

def add_database(self, name: str, database: str, tables: List[str], description: str):
"""
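Taken together, the agents.py changes let SDK users control crawl depth and URL filtering when attaching webpages to an agent. A minimal usage sketch, assuming a locally running MindsDB server, an already-created agent named 'my_agent', and that server.agents resolves to the default project's agent collection; the server URL, agent name, URLs, and filter patterns are illustrative:

import mindsdb_sdk

# Connect to a local MindsDB instance (URL is illustrative).
server = mindsdb_sdk.connect('http://127.0.0.1:47334')
agent = server.agents.get('my_agent')  # assumes this agent already exists

# Crawl one level deep from the docs root, keeping only matching pages.
agent.add_webpage(
    'https://docs.mindsdb.com',
    description='MindsDB documentation',
    crawl_depth=1,
    filters=[r'.*docs\.mindsdb\.com.*'],
)

# crawl_depth=0 scrapes only the listed URLs without following links.
agent.add_webpages(
    ['https://docs.mindsdb.com/agents/agent'],
    description='Agent reference page',
    crawl_depth=0,
)

The collection-level equivalents take the agent name as the first argument, e.g. server.agents.add_webpages('my_agent', urls, description, crawl_depth=1).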
11 changes: 7 additions & 4 deletions mindsdb_sdk/connectors/rest_api.py
@@ -413,13 +413,16 @@ def insert_files_into_knowledge_base(self, project: str, knowledge_base_name: st
return r.json()

@_try_relogin
def insert_webpages_into_knowledge_base(self, project: str, knowledge_base_name: str, urls: List[str]):
def insert_webpages_into_knowledge_base(self, project: str, knowledge_base_name: str, urls: List[str], crawl_depth: int = 1, filters: List[str] = None):
data = {
'urls': urls,
'crawl_depth': crawl_depth,
'filters': [] if filters is None else filters
}
r = self.session.put(
self.url + f'/api/projects/{project}/knowledge_bases/{knowledge_base_name}',
json={
'knowledge_base': {
'urls': urls
}
'knowledge_base': data
}
)
_raise_for_status(r)
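The connector change only reshapes the request body: crawl_depth and filters are nested under the knowledge_base key of the existing PUT endpoint shown above. A rough sketch of the equivalent raw request, with a placeholder project and knowledge base name and no authentication:

import requests

base_url = 'http://127.0.0.1:47334'     # illustrative server URL
project, kb_name = 'mindsdb', 'my_kb'   # placeholder names

payload = {
    'knowledge_base': {
        'urls': ['https://docs.mindsdb.com'],
        'crawl_depth': 1,   # follow links one level from each base URL
        'filters': [],      # empty list means no regex filtering
    }
}
r = requests.put(
    f'{base_url}/api/projects/{project}/knowledge_bases/{kb_name}',
    json=payload,
)
r.raise_for_status()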
10 changes: 7 additions & 3 deletions mindsdb_sdk/knowledge_bases.py
@@ -124,11 +124,15 @@ def insert_files(self, file_paths: List[str]):
"""
self.api.insert_files_into_knowledge_base(self.project.name, self.name, file_paths)

def insert_webpages(self, urls: List[str]):
def insert_webpages(self, urls: List[str], crawl_depth: int = 1, filters: List[str] = None):
"""
Insert data from crawled URLs to knowledge base
Insert data from crawled URLs to knowledge base.

:param urls: URLs to be crawled and inserted.
:param crawl_depth: How deep to crawl from each base URL. 0 = scrape given URLs only
:param filters: Include only URLs that match these regex patterns
"""
self.api.insert_webpages_into_knowledge_base(self.project.name, self.name, urls)
self.api.insert_webpages_into_knowledge_base(self.project.name, self.name, urls, crawl_depth=crawl_depth, filters=filters)

def insert(self, data: Union[pd.DataFrame, Query, dict]):
"""
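The same options are exposed directly on knowledge bases. A short sketch, assuming server.knowledge_bases is the default project's collection and a knowledge base named 'my_kb' already exists; the URLs and pattern are placeholders:

import mindsdb_sdk

server = mindsdb_sdk.connect('http://127.0.0.1:47334')  # illustrative URL
kb = server.knowledge_bases.get('my_kb')                # assumes 'my_kb' exists

# Scrape only the listed URL (crawl_depth=0) and keep blog pages only.
kb.insert_webpages(
    ['https://mindsdb.com/blog'],
    crawl_depth=0,
    filters=[r'.*mindsdb\.com/blog.*'],
)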
21 changes: 19 additions & 2 deletions mindsdb_sdk/utils/mind.py
@@ -45,15 +45,32 @@ class DatabaseConfig(DataSourceConfig):

class FileConfig(DataSourceConfig):
"""
Represents a colection of files that can be made available to a Mind.
Represents a collection of files that can be made available to a Mind.
"""

# Local file paths and/or URLs.
paths: List[str] = []
paths: List[str]

# TODO: Configure Vector storage. Use defaults for now.


class WebConfig(DataSourceConfig):
"""
Represents a collection of URLs that can be crawled and made available to a Mind.
"""

# Base URLs to crawl from.
urls: List[str]

# Scrapes all URLs found in the starting page (default).
# 0 = scrape provided URLs only
# -1 = no limit (we should set our own sensible limit)
crawl_depth: int = 1

# Include only URLs that match regex patterns.
filters: List[str] = []


# Create mind entity util function
def create_mind(
base_url: str,
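The new WebConfig carries the same crawling options for Minds. A hedged sketch of constructing one, assuming DataSourceConfig exposes a description field as the other configs do; the description, URLs, and filter pattern are placeholders:

from mindsdb_sdk.utils.mind import WebConfig

web_source = WebConfig(
    description='MindsDB documentation pages',   # assumes a description field on DataSourceConfig
    urls=['https://docs.mindsdb.com'],
    crawl_depth=1,                               # follow links one level deep from each base URL
    filters=[r'.*docs\.mindsdb\.com.*'],         # keep documentation pages only
)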