[Data Store] My Sql - target, source and driver(storey) #2407

Closed · wants to merge 120 commits

Changes shown from 81 of 120 commits.

Commits
7f7899c
fix bug for loging extra data in all kinds of artifacts
davesh0812 Mar 10, 2022
484af50
fix bug for https://jira.iguazeng.com/browse/ML-1936
davesh0812 Apr 7, 2022
ddb83bd
Merge branch 'mlrun:development' into development
davesh0812 Apr 7, 2022
2c3ebf9
Merge branch 'mlrun:development' into development
davesh0812 Apr 11, 2022
97890dc
black+isort
davesh0812 Apr 20, 2022
ff6fe5e
Merge branch 'mlrun:development' into development
davesh0812 May 9, 2022
ea7438f
Merge branch 'mlrun:development' into development
davesh0812 May 11, 2022
a7f10ee
support filter for story engine
davesh0812 May 11, 2022
1287266
support filter for spark engine
davesh0812 May 11, 2022
c21b21f
Revert "support filter for spark engine"
davesh0812 May 11, 2022
184cbbc
Revert "support filter for story engine"
davesh0812 May 11, 2022
7ca059d
method to local function
davesh0812 May 12, 2022
4cd0189
Merge branch 'mlrun:development' into development
davesh0812 May 25, 2022
08574ff
mongodb (pandas engine only)
davesh0812 May 26, 2022
8c0625c
Revert "method to local function"
davesh0812 May 26, 2022
1dfbf60
undo del _id
davesh0812 May 26, 2022
62740c2
_id to string
davesh0812 May 26, 2022
5366114
_id to string + DB
davesh0812 May 26, 2022
4043de4
one line func
davesh0812 Jun 1, 2022
5be56f7
Merge remote-tracking branch 'origin/development' into development
davesh0812 Jun 1, 2022
4846ec8
to static method
davesh0812 Jun 1, 2022
6173ec6
to_step template
davesh0812 Jun 1, 2022
b85103b
mongodb
davesh0812 Jun 1, 2022
f3ad4bc
to_step works without time filter
davesh0812 Jun 1, 2022
3c40df5
comments
davesh0812 Jun 6, 2022
b92f853
lint
davesh0812 Jun 8, 2022
ddbd4a4
mongodb test
davesh0812 Jun 8, 2022
2a6c80d
pymongo to requirements.txt
davesh0812 Jun 9, 2022
78ffea9
imports
davesh0812 Jun 9, 2022
53b1bc2
imports
davesh0812 Jun 9, 2022
2461267
docs
davesh0812 Jun 13, 2022
3a2fcc3
Merge branch 'development' into mongodb
davesh0812 Jun 13, 2022
0fef892
mongodb target init
davesh0812 Jun 16, 2022
70a8114
mongodb target init - override option
davesh0812 Jun 16, 2022
295be8f
target to test
davesh0812 Jun 20, 2022
cb68f6e
target to test
davesh0812 Jun 20, 2022
015beda
comments
davesh0812 Jun 20, 2022
4113150
write_df
davesh0812 Jun 20, 2022
a26cea2
change names
davesh0812 Jun 20, 2022
a4cca77
change names
davesh0812 Jun 20, 2022
bfa604d
lint
davesh0812 Jun 20, 2022
0a358d8
revert steps.py
davesh0812 Jun 20, 2022
0606ba8
imports
davesh0812 Jun 20, 2022
396bf2b
lint
davesh0812 Jun 20, 2022
fe91801
lint
davesh0812 Jun 20, 2022
06ac2a9
test
davesh0812 Jun 20, 2022
f31159b
test
davesh0812 Jun 20, 2022
f39aaa7
system test
davesh0812 Jun 27, 2022
921cf4e
Merge branch 'development' into development
davesh0812 Jun 27, 2022
b52b087
lint
davesh0812 Jun 27, 2022
4eb30e9
lint
davesh0812 Jun 27, 2022
710ebfe
save attr
davesh0812 Jun 27, 2022
59795d6
save attr
davesh0812 Jun 27, 2022
19f7f95
test2
davesh0812 Jun 27, 2022
a718117
try
davesh0812 Jun 28, 2022
6c9e12f
try2
davesh0812 Jun 28, 2022
b57ebcd
try2
davesh0812 Jun 28, 2022
63e7788
Merge branch 'mlrun:development' into development
davesh0812 Jun 28, 2022
dbdcab2
is online
davesh0812 Jun 28, 2022
fdb869b
table
davesh0812 Jun 28, 2022
5f32605
Update tests/system/feature_store/test_feature_store.py
davesh0812 Jul 4, 2022
cb1af4e
review request commit
davesh0812 Jul 4, 2022
3b4f445
review request commit
davesh0812 Jul 4, 2022
3406ade
storey driver and target
davesh0812 Jul 5, 2022
04032c1
fix bug when using "as" if the feature list
davesh0812 Jul 11, 2022
f775d96
for driver without aggregate
davesh0812 Jul 11, 2022
fb344e7
test mongodb get online features
davesh0812 Jul 11, 2022
b7ec79c
lint
davesh0812 Jul 11, 2022
640e6a2
Merge branch 'development' into mongodb-target2
davesh0812 Jul 11, 2022
d7b0981
lint
davesh0812 Jul 11, 2022
9947c51
lint
davesh0812 Jul 11, 2022
aec1825
requirements
davesh0812 Jul 11, 2022
6a99c13
lint
davesh0812 Jul 11, 2022
e92e53b
req
davesh0812 Jul 11, 2022
7f1e421
req
davesh0812 Jul 11, 2022
e227791
req
davesh0812 Jul 11, 2022
a4f15b7
req
davesh0812 Jul 11, 2022
7ad38b7
Merge branch 'mlrun:development' into development
davesh0812 Jul 12, 2022
8708ec3
code review
davesh0812 Jul 14, 2022
4bbd570
code review 2
davesh0812 Jul 19, 2022
79ccc14
end day
davesh0812 Jul 25, 2022
fa02e6e
pr changes
davesh0812 Aug 7, 2022
a6b36f1
save
davesh0812 Aug 17, 2022
7b16655
Merge branch 'mlrun:development' into development
davesh0812 Aug 17, 2022
0f5f673
init test for sql
davesh0812 Aug 21, 2022
5eda29b
test and change storey driver
davesh0812 Aug 22, 2022
19cf7b4
comments + lint
davesh0812 Aug 23, 2022
5c80013
code style
davesh0812 Aug 23, 2022
2e4dcef
code style
davesh0812 Aug 23, 2022
15e93c4
Merge branch 'mlrun:development' into development
davesh0812 Aug 24, 2022
5935914
lint + comments (split temp from tempFromStorey.py to 2 diffrent files)
davesh0812 Aug 25, 2022
228127a
Merge branch 'development' into mongodb-target2
davesh0812 Aug 25, 2022
f9c722a
Copyright + lint
davesh0812 Aug 25, 2022
b912beb
Merge remote-tracking branch 'origin/mongodb-target2' into mongodb-ta…
davesh0812 Aug 25, 2022
b679520
lint
davesh0812 Aug 25, 2022
d7062ce
requirements.txt
davesh0812 Aug 25, 2022
3884740
Merge branch 'mlrun:development' into development
davesh0812 Aug 25, 2022
2131276
Merge branch 'mlrun:development' into development
davesh0812 Aug 28, 2022
b1dbedd
Merge branch 'mlrun:development' into development
davesh0812 Aug 30, 2022
5838380
Merge branch 'mlrun:development' into development
davesh0812 Aug 31, 2022
5bf8bc6
Merge branch 'mlrun:development' into development
davesh0812 Sep 1, 2022
f5b8da7
Merge branch 'development' into mongodb-target2
davesh0812 Sep 6, 2022
188fcdd
Merge branch 'mlrun:development' into development
davesh0812 Sep 6, 2022
25a3f87
review
davesh0812 Sep 12, 2022
5c44a62
only sql db
davesh0812 Sep 19, 2022
8bd2433
lint
davesh0812 Sep 19, 2022
8239217
review
davesh0812 Oct 11, 2022
3465111
review
davesh0812 Oct 11, 2022
00ac3e5
Merge branch 'mlrun:development' into development
davesh0812 Oct 11, 2022
084bdd4
test + create table not in the init
davesh0812 Oct 12, 2022
4e6f532
Merge branch 'development' into sql-target-and-driver
davesh0812 Oct 12, 2022
0cfec0c
del storey from mlrun
davesh0812 Oct 18, 2022
c6c79f6
test + update option
davesh0812 Oct 18, 2022
126f385
lint
davesh0812 Oct 18, 2022
f23603a
if_exists
davesh0812 Oct 18, 2022
923b525
del unnecessary function
davesh0812 Oct 19, 2022
8bd0c04
SqlDB -> Sql
davesh0812 Oct 19, 2022
61006b5
SqlDB -> SQL
davesh0812 Oct 19, 2022
6d4dab5
lint
davesh0812 Oct 19, 2022
69eadfc
SqlDB -> SQL
davesh0812 Oct 23, 2022
143 changes: 143 additions & 0 deletions mlrun/datastore/sources.py
@@ -18,6 +18,7 @@
from datetime import datetime
from typing import Dict, List, Optional, Union

import pandas as pd
import v3io
import v3io.dataplane
from nuclio import KafkaTrigger
@@ -49,6 +50,29 @@ def get_source_step(source, key_fields=None, time_field=None, context=None):
return source.to_step(key_fields, time_field, context)


class _SqlDBIterator:
def __init__(self, collection, iter_chunksize):
"""
Iterate over given Sql collection

:param iter_chunksize: number of rows per chunk
:param collection: sql collection
"""
self.collection = collection
self.iter_chunksize = iter_chunksize
self.keys = self.collection.keys()

def __iter__(self):
return self

def __next__(self):
chunk = self.collection.fetchmany(self.iter_chunksize)
if chunk:
return pd.DataFrame(chunk, columns=self.keys)
else:
raise StopIteration


class BaseSourceDriver(DataSource):
support_spark = False
support_storey = False
@@ -826,6 +850,124 @@ def add_nuclio_trigger(self, function):
return func


class SqlDBSource(BaseSourceDriver):

kind = "sqldb"
support_storey = True
support_spark = False
_SQL_DB_PATH_STRING_ENV_VAR = "SQL_DB_PATH_STRING"

def __init__(
self,
name: str = "",
chunksize: int = None,
key_field: str = None,
time_field: str = None,
schedule: str = None,
start_time: Optional[Union[datetime, str]] = None,
end_time: Optional[Union[datetime, str]] = None,
db_path: str = None,
collection_name: str = None,
spark_options: dict = None,
):
"""
Reads SqlDB as input source for a flow.

example::
db_path = "sqlite:///stockmarket.db"
source = SqlDBSource(
collection_name='source_name', db_path=self.db, key_field='key'
)

:param name: source name
:param chunksize: number of rows per chunk (default large single chunk)
:param key_field: the column to be used as the key for the collection.
:param time_field: the column to be parsed as the timestamp for events. Defaults to None
:param start_time: filters out data before this time
:param end_time: filters out data after this time
:param schedule: string to configure scheduling of the ingestion job. For example '*/30 * * * *' will
cause the job to run every 30 minutes
:param db_path: url string connection to sql database.
If not set, the SQL_DB_PATH_STRING environment variable will be used.
:param collection_name: the name of the collection to access,
from the current database
:param spark_options: additional spark read options
"""

db_path = db_path or os.getenv(self._SQL_DB_PATH_STRING_ENV_VAR)
if db_path is None:
raise mlrun.errors.MLRunInvalidArgumentError(
f"cannot initialize SqlDBSource without the db_path argument "
f"or the {self._SQL_DB_PATH_STRING_ENV_VAR} environment variable"
)
attrs = {
"chunksize": chunksize,
"spark_options": spark_options,
"collection_name": collection_name,
"db_path": db_path,
}
attrs = {key: value for key, value in attrs.items() if value is not None}
super().__init__(
name,
attributes=attrs,
key_field=key_field,
time_field=time_field,
schedule=schedule,
start_time=start_time,
end_time=end_time,
)

def to_dataframe(self):
import sqlalchemy as db

db_path = self.attributes.get("db_path")
collection_name = self.attributes.get("collection_name")
chunksize = self.attributes.get("chunksize")
if collection_name and db_path:
engine = db.create_engine(db_path)
metadata = db.MetaData()
connection = engine.connect()
collection = db.Table(
[Review comment · Collaborator]
Why call a table a "collection"? Where does this terminology come from? It looks like the term collection only exists in Oracle's PL/SQL, and it doesn't mean a table there either.

collection_name, metadata, autoload=True, autoload_with=engine
)
results = connection.execute(db.select([collection]))
if chunksize:
return _SqlDBIterator(
collection=results, iter_chunksize=chunksize
)
else:
results = results.fetchall()
df = pd.DataFrame(results)
df.columns = results[0].keys()
connection.close()
return df
else:
raise mlrun.errors.MLRunInvalidArgumentError(
"collection_name and db_name args must be specified"
)

def to_step(self, key_field=None, time_field=None, context=None):
from mlrun.datastore.storeySourse import SqlDBSourceStorey

attributes = self.attributes or {}
if context:
attributes["context"] = context

return SqlDBSourceStorey(
key_field=self.key_field or key_field,
time_field=self.time_field or time_field,
# storage_options=self._get_store().get_storage_options(),
end_filter=self.end_time,
start_filter=self.start_time,
filter_column=self.time_field or time_field,
**attributes,
)

def is_iterator(self):
return bool(self.attributes.get("chunksize"))


# map of sources (exclude DF source which is not serializable)
source_kind_to_driver = {
"": BaseSourceDriver,
@@ -837,4 +979,5 @@ def add_nuclio_trigger(self, function):
CustomSource.kind: CustomSource,
BigQuerySource.kind: BigQuerySource,
SnowflakeSource.kind: SnowflakeSource,
SqlDBSource.kind: SqlDBSource,
}
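
For reviewers who want to exercise the new source end-to-end, here is a minimal usage sketch (not part of the diff). It assumes the mlrun feature-store ingest API; the table name, entity column, and connection string are illustrative.

    # Usage sketch: read a SQL table through SqlDBSource and ingest it into a
    # feature set. Names below (stocks, ticker) are illustrative assumptions.
    import mlrun.feature_store as fstore
    from mlrun.datastore.sources import SqlDBSource

    db_path = "sqlite:///stockmarket.db"  # any SQLAlchemy-style connection URL
    source = SqlDBSource(
        name="stocks-source",
        db_path=db_path,
        collection_name="stocks",  # the table to read
        key_field="ticker",        # primary-key column
        chunksize=500,             # optional: read the table in chunks
    )

    stocks_set = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")])
    fstore.ingest(stocks_set, source)  # runs the ingestion flow with this source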
136 changes: 136 additions & 0 deletions mlrun/datastore/storeyDriver.py
@@ -0,0 +1,136 @@
# Copyright 2018 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List

import storey


class SqlDBDriver(storey.Driver):
"""
Database connection to Sql data basw.
:param db_path: url string connection to sql database.
:param primary_key: the primary key of the collection.
"""

def __init__(
self,
primary_key: str,
db_path: str,
aggregation_attribute_prefix: str = "aggr_",
aggregation_time_attribute_prefix: str = "_",
):
self._db_path = db_path
self._sql_connection = None
self._primary_key = primary_key
self._mtime_name = "$_mtime_"
self._storey_key = "storey_key"

self._aggregation_attribute_prefix = aggregation_attribute_prefix
self._aggregation_time_attribute_prefix = aggregation_time_attribute_prefix

def _lazy_init(self):
import sqlalchemy as db

self._closed = False
if not self._sql_connection:
self._engine = db.create_engine(self._db_path)
self._sql_connection = self._engine.connect()

def collection(self, table_path):
import sqlalchemy as db

metadata = db.MetaData()
return db.Table(
table_path[3:].split("/")[1],
metadata,
autoload=True,
autoload_with=self._engine,
)

async def _save_schema(self, container, table_path, schema):
# the table schema is not persisted for SQL targets
self._lazy_init()
return None

async def _load_schema(self, container, table_path):
# the table schema is not persisted for SQL targets
self._lazy_init()
return None

async def _save_key(
self, container, table_path, key, aggr_item, partitioned_by_key, additional_data
):
import sqlalchemy as db

self._lazy_init()

collection = self.collection(table_path)
return_val = None
try:
return_val = self._sql_connection.execute(
collection.insert(), [additional_data]
)
except db.exc.IntegrityError:
# the key already exists in the table; keep the existing row
pass
return return_val

async def _load_aggregates_by_key(self, container, table_path, key):
self._lazy_init()
collection = self.collection(table_path)
try:
agg_val, values = await self._get_all_fields(key, collection)
if not agg_val:
agg_val = None
if not values:
values = None
return [agg_val, values]
except Exception:
return [None, None]

async def _load_by_key(self, container, table_path, key, attribute):
self._lazy_init()
collection = self.collection(table_path)
if attribute == "*":
_, values = await self._get_all_fields(key, collection)
else:
values = None
return values

async def close(self):
pass

async def _get_all_fields(self, key, collection):
try:
# the key value is interpolated via repr() so that string keys are quoted
my_query = f"SELECT * FROM {collection} where {self._primary_key}={key!r}"
results = self._sql_connection.execute(my_query).fetchall()
except Exception as e:
raise RuntimeError(f"Failed to get key {key}. Response error was: {e}")

return None, {
results[0]._fields[i]: results[0][i] for i in range(len(results[0]))
}

async def _get_specific_fields(self, key: str, collection, attributes: List[str]):
try:
# the key value is interpolated via repr() so that string keys are quoted
my_query = (
f"SELECT {','.join(attributes)} FROM {collection} "
f"where {self._primary_key}={key!r}"
)
results = self._sql_connection.execute(my_query).fetchall()
except Exception as e:
raise RuntimeError(f"Failed to get key {key}. Response error was: {e}")

return None, {
results[0]._fields[i]: results[0][i] for i in range(len(results[0]))
}

def supports_aggregations(self):
return False
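
To make the driver's persistence model concrete, here is a standalone sketch (not part of the diff) of the SQLAlchemy pattern it builds on: reflect an existing table with autoload (as collection() does), insert a row (as _save_key does), and select by the primary key (as _load_by_key does). The table name and columns are illustrative, and the legacy select([...]) call style matches the SQLAlchemy version used elsewhere in this PR.

    import sqlalchemy as db

    engine = db.create_engine("sqlite:///stockmarket.db")
    connection = engine.connect()
    metadata = db.MetaData()

    # reflect the existing table schema from the database,
    # exactly as SqlDBDriver.collection() does
    stocks = db.Table("stocks", metadata, autoload=True, autoload_with=engine)

    # _save_key maps to a plain INSERT; a duplicate key raises IntegrityError,
    # which the driver swallows to keep the existing row
    connection.execute(stocks.insert(), [{"ticker": "AAPL", "price": 150.0}])

    # _load_by_key / _get_all_fields map to a SELECT filtered on the primary key
    row = connection.execute(
        db.select([stocks]).where(stocks.c.ticker == "AAPL")
    ).fetchone()
    print(dict(row))

    connection.close()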