Merge branch 'main' into verify-web
ZoranPandovski committed May 9, 2024
2 parents a6d5a7e + f013cfc commit e6576c2
Showing 17 changed files with 43 additions and 79 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_deploy_prod.yml
@@ -101,7 +101,7 @@ jobs:
# We only want to run one deploy job for an env at a time
# Don't cancel in progress jobs because it may be for a different PR
concurrency:
-      group: deploy-${{ matrix.deploy-env }}
+      group: deploy-prod
cancel-in-progress: false
steps:
- uses: FranzDiebold/github-env-vars-action@v2
2 changes: 1 addition & 1 deletion .github/workflows/build_deploy_staging.yml
@@ -54,7 +54,7 @@ jobs:
# We only want to run one deploy job for an env at a time
# Don't cancel in progress jobs because it may be for a different PR
concurrency:
-      group: deploy-${{ matrix.deploy-env }}
+      group: deploy-${{ matrix.environment }}
cancel-in-progress: false
steps:
- uses: FranzDiebold/github-env-vars-action@v2
10 changes: 5 additions & 5 deletions docs/integrations/ai-engines/statsforecast.mdx
@@ -233,7 +233,7 @@ The `engine` parameter in the `training_options` clause specifies the ML engine
We can check the training status with the following query:

```bash
-> db.getCollection('models').find({
+> db.models.find({
name: 'quarterly_expenditure_forecaster'
})
```
@@ -244,13 +244,13 @@ Once the model status is `complete`, the behavior is the same as with any other

```bash
> db.quarterly_expenditure_forecaster.find({
-    collection: 'mongo_demo_db.historical_expenditures',
-    query: {
-        "$where": "this.month > latest and this.category = 'food'"
-    }
+    "collection": "mongo_pred_01.historical_expenditures",
+    "query": {"category": "food"}
}).limit(3)
```
+
+By default the forecasts are made for `month > LATEST`.

Here is the output data:

```bash
...
```
11 changes: 2 additions & 9 deletions docs/mint.json
@@ -13,7 +13,8 @@
},
"feedback": {
"suggestEdit": true,
"raiseIssue": true
"raiseIssue": true,
"thumbsRating": true
},
"openapi": "https://raw.githubusercontent.com/mindsdb/mindsdb/openapi-specs/mindsdb/api/http/openapi.yml",
"api": {
@@ -794,14 +795,6 @@
"sdks/python/refresh_job",
"sdks/python/get_history"
]
-      },
-      {
-        "group": "AI Agents",
-        "pages": [
-          "sdks/python/agents",
-          "sdks/python/agents_skills",
-          "sdks/python/agents_knowledge_bases"
-        ]
}
]
},
@@ -91,7 +91,7 @@ Follow [this instruction](/integrations/ai-engines/openai#setup) to set up the O
Once the `insertOne` method has started execution, we can check the status of the creation process with the following query:

```bash
-mindsdb> db.getCollection('models').find({
+mindsdb> db.models.find({
'name': 'question_answering'
})
```
@@ -88,7 +88,7 @@ Follow [this instruction](/integrations/ai-engines/openai#setup) to set up the O
Once the `insertOne` method has started execution, we can check the status of the creation process with the following query:

```bash
-mindsdb> db.getCollection('models').find({
+mindsdb> db.models.find({
'name': 'sentiment_classifier'
})
```
@@ -88,7 +88,7 @@ Follow [this instruction](/integrations/ai-engines/openai#setup) to set up the O
Once the `insertOne` method has started execution, we can check the status of the creation process with the following query:

```bash
-mindsdb> db.getCollection('models').find({
+mindsdb> db.models.find({
'name': 'text_summarization'
})
```
15 changes: 4 additions & 11 deletions mindsdb/api/http/namespaces/knowledge_bases.py
@@ -10,11 +10,11 @@
from mindsdb.api.executor.controllers.session_controller import SessionController
from mindsdb.api.http.utils import http_error
from mindsdb.metrics.metrics import api_endpoint_metrics
-from mindsdb.integrations.handlers.langchain_embedding_handler import construct_model_from_args
from mindsdb.integrations.handlers.web_handler.urlcrawl_helpers import get_all_websites
from mindsdb.interfaces.database.projects import ProjectController
from mindsdb.interfaces.file.file_controller import FileController
from mindsdb.integrations.utilities.rag.loaders.file_loader import FileLoader
+from mindsdb.integrations.utilities.rag.splitters.file_splitter import FileSplitter, FileSplitterConfig
from mindsdb.interfaces.knowledge_base.controller import KnowledgeBaseTable
from mindsdb.utilities import log

@@ -27,12 +27,9 @@
]


-def _insert_file_into_knowledge_base(table: KnowledgeBaseTable, file_name: str, embeddings_provider: str):
-    # import here to prevent the need to set OPENAI_API_KEY
-    from mindsdb.integrations.utilities.rag.splitters.file_splitter import FileSplitter, FileSplitterConfig
-
+def _insert_file_into_knowledge_base(table: KnowledgeBaseTable, file_name: str):
    file_controller = FileController()
-    splitter = FileSplitter(FileSplitterConfig(embeddings=construct_model_from_args({'class': embeddings_provider})))
+    splitter = FileSplitter(FileSplitterConfig())
file_path = file_controller.get_file_path(file_name)
loader = FileLoader(file_path)
split_docs = []
@@ -108,7 +105,6 @@ def put(self, project_name: str, knowledge_base_name: str):
f'Project with name {project_name} does not exist'
)
try:
-            existing_kb = session.kb_controller.get(knowledge_base_name, project.id)
table = session.kb_controller.get_table(knowledge_base_name, project.id)
except ValueError:
# Knowledge Base must exist.
@@ -122,12 +118,9 @@
files = kb.get('files', [])
urls = kb.get('urls', [])

-        # Use same embeddings as knowledge base if possible.
-        embeddings_provider = existing_kb.embedding_model.learn_args.get('class', 'openai')
-
        # Load, split, & embed files into Knowledge Base.
        for file_name in files:
-            _insert_file_into_knowledge_base(table, file_name, embeddings_provider)
+            _insert_file_into_knowledge_base(table, file_name)
# Crawl, split, & embed web pages into Knowledge Base.
_insert_web_pages_into_knowledge_base(table, urls)
return '', HTTPStatus.OK
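
With the per-knowledge-base embeddings provider gone, file ingestion no longer needs `construct_model_from_args` or an OpenAI key. A minimal sketch of the simplified flow, assuming `FileLoader` exposes a langchain-style `lazy_load()` and that `KnowledgeBaseTable` has a document-insert method (both assumptions; neither API is shown in this hunk):

```python
from mindsdb.integrations.utilities.rag.loaders.file_loader import FileLoader
from mindsdb.integrations.utilities.rag.splitters.file_splitter import FileSplitter, FileSplitterConfig

def insert_file(table, file_path: str):
    # Default config: no embeddings object is required any more, since PDF
    # splitting now falls back to the recursive character splitter.
    splitter = FileSplitter(FileSplitterConfig())
    split_docs = []
    for doc in FileLoader(file_path).lazy_load():  # assumed loader API
        split_docs += splitter.split_documents([doc])
    table.insert_documents(split_docs)  # hypothetical method name
```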
@@ -1,2 +1 @@
chardet
-bs4==0.0.2
@@ -217,15 +217,17 @@ def create_agent(self, df: pd.DataFrame, args: Dict=None, pred_args: Dict=None)

        # Set up embeddings model if needed.
        if args.get('mode') == 'retrieval':

            embeddings_args = args.pop('embedding_model_args', {})

            # no embedding model args provided, use default provider.
            if not embeddings_args:
+                embeddings_provider = self._get_embedding_model_provider(args)
                logger.warning("'embedding_model_args' not found in input params, "
-                              f"Trying to use default provider: {DEFAULT_EMBEDDINGS_MODEL_PROVIDER}"
+                              f"Trying to use LLM provider: {embeddings_provider}"
                              )
-                embeddings_args['class'] = DEFAULT_EMBEDDINGS_MODEL_PROVIDER
+                embeddings_args['class'] = embeddings_provider
+            # Include API keys if present.
+            embeddings_args.update({k: v for k, v in args.items() if 'api_key' in k})

            # create embeddings model
            pred_args['embeddings_model'] = self._create_embeddings_model(embeddings_args)
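The helper `_get_embedding_model_provider` referenced above is not shown in this diff; a hypothetical sketch of the fallback it implies, assuming the agent's LLM provider travels in `args['provider']`:

```python
DEFAULT_EMBEDDINGS_MODEL_PROVIDER = 'openai'  # assumed prior default

def _get_embedding_model_provider(args: dict) -> str:
    # Reuse the provider already configured for the agent's LLM when
    # available, otherwise fall back to the old hard-coded default.
    return args.get('provider', DEFAULT_EMBEDDINGS_MODEL_PROVIDER)
```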
@@ -4,5 +4,7 @@ tiktoken==0.5.2
anthropic==0.3.5
langfuse==2.27.1 # Tracing
litellm==1.35.0
+chromadb # Knowledge bases.
+langchain-experimental
-r mindsdb/integrations/handlers/openai_handler/requirements.txt
-r mindsdb/integrations/handlers/langchain_embedding_handler/requirements.txt
3 changes: 1 addition & 2 deletions mindsdb/integrations/handlers/web_handler/requirements.txt
@@ -1,3 +1,2 @@
-bs4
-html2text
pymupdf
+html2text
16 changes: 1 addition & 15 deletions mindsdb/integrations/utilities/rag/splitters/file_splitter.py
@@ -2,9 +2,6 @@
from typing import Callable, List, Union

from langchain_core.documents import Document
-from langchain_core.embeddings import Embeddings
-from langchain_experimental.text_splitter import SemanticChunker
-from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import MarkdownHeaderTextSplitter, HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter

from mindsdb.utilities import log
@@ -32,12 +29,8 @@ class FileSplitterConfig:
chunk_size: int = DEFAULT_CHUNK_SIZE
# How many characters each chunk should overlap. Not all splitters will adhere exactly to this (it's more of a guideline)
chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
-    # Embeddings to use for semantic chunking (default is OpenAI)
-    embeddings: Embeddings = OpenAIEmbeddings()
# Default recursive splitter to use for text files, or unsupported files
recursive_splitter: RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-    # Semantic chunker to use for PDF splitting
-    semantic_chunker: SemanticChunker = SemanticChunker(embeddings)
# Splitter to use for MD splitting
markdown_splitter: MarkdownHeaderTextSplitter = MarkdownHeaderTextSplitter(headers_to_split_on=DEFAULT_MARKDOWN_HEADERS_TO_SPLIT_ON)
# Splitter to use for HTML splitting
@@ -53,7 +46,7 @@ def __init__(self, config: FileSplitterConfig):
'''
self.config = config
self._extension_map = {
-            '.pdf': self._semantic_chunker_fn,
+            '.pdf': self._recursive_splitter_fn,
'.md': self._markdown_splitter_fn,
'.html': self._html_splitter_fn
}
@@ -88,13 +81,6 @@ def split_documents(self, documents: List[Document], default_failover: bool = Tr
split_documents += split_func(document.page_content)
return split_documents

-    def _semantic_chunker_fn(self) -> Callable:
-        # Semantic chunker's split_text returns List[str].
-        def semantic_chunk(content: str) -> List[Document]:
-            chunked_content = self.config.semantic_chunker.split_text(content)
-            return [Document(page_content=c) for c in chunked_content]
-        return semantic_chunk
-
def _markdown_splitter_fn(self) -> Callable:
return self.config.markdown_splitter.split_text

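With the semantic chunker removed, `.pdf` documents now go through the recursive character splitter, so splitting requires no embeddings model (and no `OPENAI_API_KEY`). A minimal usage sketch under the new defaults, mirroring the updated test below:

```python
from langchain_core.documents import Document
from mindsdb.integrations.utilities.rag.splitters.file_splitter import FileSplitter, FileSplitterConfig

# No embeddings argument is needed; the defaults cover every supported extension.
splitter = FileSplitter(FileSplitterConfig())
pdf_doc = Document(
    page_content='This is a test PDF file. Let us try to do some splitting!',
    metadata={'extension': '.pdf'}  # split_documents dispatches on this key
)
chunks = splitter.split_documents([pdf_doc])
assert len(chunks) > 0
```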
6 changes: 4 additions & 2 deletions mindsdb/interfaces/knowledge_base/controller.py
@@ -354,7 +354,7 @@ def add(

if embedding_model is None:
# create default embedding model
-            model_name = self._create_default_embedding_model(project.name, name)
+            model_name = self._create_default_embedding_model(project.name, name, params=params)

# memorize to remove it later
params['embedding_model'] = model_name
@@ -425,13 +425,15 @@ def _create_persistent_chroma(self, kb_name, engine="chromadb"):
self.session.integration_controller.add(vector_store_name, engine, connection_args)
return vector_store_name

def _create_default_embedding_model(self, project_name, kb_name, engine="langchain_embedding"):
def _create_default_embedding_model(self, project_name, kb_name, engine="langchain_embedding", params: dict = None):
"""create a default embedding model for knowledge base, if not specified"""
model_name = f"{kb_name}_default_model"
using_args = {}
if engine == 'langchain_embedding':
# Use default embeddings.
using_args['class'] = 'openai'
+            # Include API key if provided.
+            using_args.update({k: v for k, v in params.items() if 'api_key' in k})
statement = CreatePredictor(
name=Identifier(parts=[project_name, model_name]),
using=using_args,
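The practical effect of threading `params` through is that an API key supplied at knowledge-base creation now reaches the default embedding model. An illustrative trace with made-up values:

```python
params = {'openai_api_key': 'sk-...', 'unrelated_option': True}  # illustrative
using_args = {'class': 'openai'}
using_args.update({k: v for k, v in params.items() if 'api_key' in k})
# using_args == {'class': 'openai', 'openai_api_key': 'sk-...'}
```

Note that the new keyword defaults to `params: dict = None`, so the `update` line relies on callers passing a dict, as the call site in `add()` above does.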
2 changes: 1 addition & 1 deletion requirements/requirements.txt
@@ -40,8 +40,8 @@ msal
langchain>=0.1.9
langchain-core>=0.1.28
langchain-community
-langchain-experimental
langchain-openai
langchain-text_splitters
+bs4 # For web crawler.
lark
prometheus-client==0.20.0
32 changes: 10 additions & 22 deletions tests/integrations/utilities/rag/test_file_splitter.py
@@ -1,27 +1,24 @@
from unittest.mock import patch

from langchain_core.documents import Document
-from langchain_experimental.text_splitter import SemanticChunker
from langchain_text_splitters import MarkdownHeaderTextSplitter, HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter
from mindsdb.integrations.utilities.rag.splitters.file_splitter import FileSplitter, FileSplitterConfig


-@patch('mindsdb.integrations.utilities.rag.splitters.file_splitter.OpenAIEmbeddings')
-def test_split_documents_pdf(mock_embeddings):
+def test_split_documents_pdf():
pdf_doc = Document(
page_content='This is a test PDF file. Let us try to do some splitting!',
metadata={'extension': '.pdf'}
)
-    semantic_chunker = SemanticChunker(mock_embeddings)
-    file_splitter = FileSplitter(FileSplitterConfig(embeddings=mock_embeddings, semantic_chunker=semantic_chunker))
+    recursive_splitter = RecursiveCharacterTextSplitter()
+    file_splitter = FileSplitter(FileSplitterConfig(
+        recursive_splitter=recursive_splitter
+    ))
split_pdf_docs = file_splitter.split_documents([pdf_doc])
-    assert mock_embeddings.embed_documents.called
-    assert len(mock_embeddings.embed_documents.call_args.args[0]) > 0
assert len(split_pdf_docs) > 0


-@patch('mindsdb.integrations.utilities.rag.splitters.file_splitter.OpenAIEmbeddings')
-def test_split_documents_md(mock_embeddings):
+def test_split_documents_md():
md_content = '''
# Unit Testing for Dummies
This MD document covers how to write basic unit tests.
@@ -40,7 +37,6 @@ def test_split_documents_md(mock_embeddings):
]
md_text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
file_splitter = FileSplitter(FileSplitterConfig(
-        embeddings=mock_embeddings,
markdown_splitter=md_text_splitter
))
split_md_docs = file_splitter.split_documents([md_doc])
@@ -51,8 +47,7 @@
assert 'To be continued!' in split_md_docs[2].page_content


-@patch('mindsdb.integrations.utilities.rag.splitters.file_splitter.OpenAIEmbeddings')
-def test_split_documents_html(mock_embeddings):
+def test_split_documents_html():
html_content = '''
<!DOCTYPE html>
<html>
@@ -85,7 +80,6 @@ def test_split_documents_html(mock_embeddings):
]
html_text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
file_splitter = FileSplitter(FileSplitterConfig(
-        embeddings=mock_embeddings,
html_splitter=html_text_splitter
))
html_doc = Document(
@@ -105,11 +99,9 @@
assert 'Some concluding text about Foo' in split_html_docs[7].page_content


-@patch('mindsdb.integrations.utilities.rag.splitters.file_splitter.OpenAIEmbeddings')
-def test_split_documents_default(mock_embeddings):
+def test_split_documents_default():
recursive_splitter = RecursiveCharacterTextSplitter()
file_splitter = FileSplitter(FileSplitterConfig(
-        embeddings=mock_embeddings,
recursive_splitter=recursive_splitter
))
txt_doc = Document(
@@ -121,9 +113,8 @@
assert 'This is a text file!' in split_txt_docs[0].page_content


-@patch('mindsdb.integrations.utilities.rag.splitters.file_splitter.OpenAIEmbeddings')
@patch('mindsdb.integrations.utilities.rag.splitters.file_splitter.MarkdownHeaderTextSplitter')
-def test_split_documents_failover(mock_embeddings, mock_md_splitter):
+def test_split_documents_failover(mock_md_splitter):
md_content = '''
# Unit Testing for Dummies
This MD document covers how to write basic unit tests.
@@ -134,7 +125,6 @@ def test_split_documents_failover(mock_embeddings, mock_md_splitter):
'''
mock_md_splitter.split_text.side_effect = Exception('Something went wrong!')
file_splitter = FileSplitter(FileSplitterConfig(
-        embeddings=mock_embeddings,
markdown_splitter=mock_md_splitter
))
md_doc = Document(
@@ -147,9 +137,8 @@ def test_split_documents_failover(mock_embeddings, mock_md_splitter):
assert len(split_md_docs) > 0


-@patch('mindsdb.integrations.utilities.rag.splitters.file_splitter.OpenAIEmbeddings')
@patch('mindsdb.integrations.utilities.rag.splitters.file_splitter.MarkdownHeaderTextSplitter')
-def test_split_documents_no_failover(mock_embeddings, mock_md_splitter):
+def test_split_documents_no_failover(mock_md_splitter):
md_content = '''
# Unit Testing for Dummies
This MD document covers how to write basic unit tests.
@@ -160,7 +149,6 @@ def test_split_documents_no_failover(mock_embeddings, mock_md_splitter):
'''
mock_md_splitter.split_text.side_effect = Exception('Something went wrong!')
file_splitter = FileSplitter(FileSplitterConfig(
-        embeddings=mock_embeddings,
markdown_splitter=mock_md_splitter
))
md_doc = Document(
6 changes: 3 additions & 3 deletions tests/scripts/check_requirements.py
@@ -39,16 +39,16 @@ def get_requirements_from_file(path):
# and not explicitly imported in mindsdb.
MAIN_RULE_IGNORES = {
"DEP003": ["torch"],
-    # Ignore Langhchain since the requirements check will still fail even if it's conditionally imported for certain features.
    "DEP001": ["torch"],
-    "DEP002": ["psycopg2-binary", "lark"],
+    # bs4 is used by Agents & Knowledge Bases (web handler).
+    "DEP002": ["psycopg2-binary", "lark", "bs4"],
}

# THe following packages need exceptions because they are optional deps of some other packages. e.g. langchain CAN use openai
# (pysqlite3 is imported in an unusual way in the chromadb handler and needs to be excluded too)
# pypdf and openpyxl are optional deps of langchain, that are used for the file handler
OPTIONAL_HANDLER_DEPS = ["pysqlite3", "torch", "openai", "tiktoken", "wikipedia", "anthropic", "pypdf", "openpyxl",
"sentence-transformers", "faiss-cpu", "litellm"]
"sentence-transformers", "faiss-cpu", "litellm", "chromadb"]

# List of rules we can ignore for specific packages
# Here we ignore any packages in the main requirements.txt for "listed but not used" errors, because they will be used for the core code but not necessarily in a given handler
