# SQL Ingest - Ingest data using SQL query 

In [1]:
# nuclio: ignore
import nuclio

ModuleNotFoundError: No module named 'nuclio'

In [2]:
%nuclio config kind = "job"
%nuclio config spec.build.baseImage = "mlrun/mlrun"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/mlrun'


In [3]:
%%nuclio cmd -c
pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 
pip install sqlalchemy==1.3.11
pip install PyMySQL==0.9.3

In [4]:
import pandas as pd
import pyhive
from sqlalchemy.engine import create_engine
from mlrun.execution import MLClientCtx


def sql_to_file(
    context: MLClientCtx,
    sql_query: str,
    database_url: str,
    file_ext: str = "parquet",
) -> None:
    """SQL Ingest - Ingest data using SQL query

    :param context:           the function context
    :param sql_query:         the sql query used to retrieve the data
    :param database_url:      database connection URL
    :param file_ext:          ("parquet") format for result file

"""

    engine = create_engine(database_url)
    df = pd.read_sql(sql_query, engine)

    context.log_dataset('query result',
                        df=df,
                        format=file_ext,
                        artifact_path=context.artifact_subpath('data'))


In [5]:
# nuclio: end-code

### mlconfig

In [7]:
from mlrun import mlconf
import os
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'


KeyError: 'HOME'

### Save function

In [7]:
def mount_secret(
    secret_name, volume_mount_path, volume_name='secret', items=None
):
    def _mount_secret(task):
        from kubernetes import client as k8s_client
        vol = k8s_client.V1SecretVolumeSource(secret_name=secret_name, items=items)
        return task.add_volume(
            k8s_client.V1Volume(name=volume_name, secret=vol)
        ).add_volume_mount(
            k8s_client.V1VolumeMount(mount_path=volume_mount_path, name=volume_name)
        )
    return _mount_secret

In [9]:
from mlrun import code_to_function, NewTask
import os

fn = code_to_function(name="sql_to_file",
                      handler="sql_to_file",
                      description="SQL To File - Ingest data using SQL query",
                      categories=["data-prep"],
                      labels={"author": "adih"})

if "V3IO_ACCESS_KEY" in list(os.environ):
    fn.apply(mount_secret(secret_name='presto-tls',
                        volume_mount_path= '/var/run/iguazio/secrets/'))


### Build the image

In [9]:
fn.deploy()

[mlrun] 2020-06-29 12:42:44,100 starting remote build, image: .mlrun/func-default-sql-ingest-latest
[36mINFO[0m[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 
[36mINFO[0m[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 
[36mINFO[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
[36mINFO[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
[36mINFO[0m[0000] Built cross stage deps: map[]                
[36mINFO[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
[36mINFO[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
[36mINFO[0m[0001] Unpacking rootfs as cmd RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 requires it. 
[36mINFO[0m[0027] Taking snapshot of full filesystem...        
[36mINFO[0m[0039] Resolving paths                              
[36mINFO[0m[0046] RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 
[36mINFO[0m[0046] cmd: /bin/sh       

True

In [10]:
fn.export('function.yaml')

[mlrun] 2020-06-30 01:58:41,604 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x2239dbf01c8>

## Test

### Reading from a public MySQL DB

In [11]:
mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam'
mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family'

In [12]:
from mlrun import NewTask, run_local

sql_task = NewTask(name='sql',
                   handler=sql_to_file,
                   params={'sql_query': mysql_query,
                           'database_url': mysql_url})


In [13]:
sql_func = run_local(sql_task)

[mlrun] 2020-06-29 12:43:59,253 starting run sql uid=b0914edaa58e45ee97c132200c6b60be  -> http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...0c6b60be,0,Jun 29 12:43:59,completed,sql,v3io_user=adminkind=handlerowner=adminhost=jupyter-b9c7995f9-4fblj,,"sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM familydatabase_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",,query result


to track results use .show() or .logs() or in CLI: 
!mlrun get run b0914edaa58e45ee97c132200c6b60be --project default , !mlrun logs b0914edaa58e45ee97c132200c6b60be --project default
[mlrun] 2020-06-29 12:44:02,344 run executed, status=completed


#### Run it on a cluster

In [14]:
fn.run(sql_task)

[mlrun] 2020-06-29 12:44:02,350 starting run sql uid=46ff7ef67e314be49353982cdd8d073a  -> http://mlrun-api:8080
[mlrun] 2020-06-29 12:44:02,622 Job is running in the background, pod: sql-mplpz
[mlrun] 2020-06-29 12:44:09,070 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...dd8d073a,0,Jun 29 12:44:06,completed,sql,v3io_user=adminkind=jobowner=adminhost=sql-mplpz,,"sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM familydatabase_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",,query result


to track results use .show() or .logs() or in CLI: 
!mlrun get run 46ff7ef67e314be49353982cdd8d073a --project default , !mlrun logs 46ff7ef67e314be49353982cdd8d073a --project default
[mlrun] 2020-06-29 12:44:11,893 run executed, status=completed


<mlrun.model.RunObject at 0x7f87fba74b00>

### SQL query from Iguazio Key Value via Presto

You need to create a table and set the sql_table path accordingly. <br>
you can find an example of creating such table in https://github.com/v3io/tutorials/blob/master/data-ingestion-and-preparation/basic-data-ingestion-and-preparation.ipynb

In [15]:
# nuclio: ignore
import os
sql_table = os.path.join('v3io.users."'+str(os.getenv('V3IO_USERNAME'))+'/examples/stocks_tab"')
sql_query_string = 'select * from '+sql_table+""

In [16]:
%sql select * from $sql_table limit 10

Done.


securitydesc,securitytype,time,isin,minprice,date,endprice,numberoftrades,mnemonic,currency,securityid,maxprice,tradedvolume,startprice
UBS I.ETF-DL G.SEL.DIV.AD,ETF,08:27,IE00BMP3HG27,8.418,2018-03-26 00:00:00.000,8.418,1,UBUM,EUR,2505450,8.418,403,8.418
"GILEAD SCIENCES DL-,001",Common stock,08:00,US3755581036,59.7,2018-03-26 00:00:00.000,59.84,3,GIS,EUR,2506495,59.84,745,59.7
"3M CO. DL-,01",Common stock,08:00,US88579Y1010,176.51,2018-03-26 00:00:00.000,176.51,1,MMM,EUR,2506577,176.51,39,176.51
DIEBOLD NIXDORF INH.O.N.,Common stock,08:06,DE000A0CAYB2,66.3,2018-03-26 00:00:00.000,66.3,1,WIN,EUR,2504286,66.3,60,66.3
XTR.II EUR.INF.LINK.BD 1C,ETF,08:13,LU0290358224,218.97,2018-03-26 00:00:00.000,218.97,1,DBXK,EUR,2505840,218.97,110,218.97
UBS-ETF-MSCI EMU S.C.EOAD,ETF,08:33,LU0671493277,100.2,2018-03-26 00:00:00.000,100.2,1,UEFD,EUR,2506045,100.2,180,100.2
ASMALLWORLD AG SF 1,Common stock,08:23,CH0404880129,12.7,2018-03-26 00:00:00.000,12.7,1,1Q7,EUR,3089122,12.7,400,12.7
IS.DJ GLOB.TITAN.50 U.ETF,ETF,08:42,DE0006289382,31.25,2018-03-26 00:00:00.000,31.25,1,EXI2,EUR,2505029,31.25,50,31.25
ISHS IV-AGEING POPUL.ETF,ETF,08:17,IE00BYZK4669,4.926,2018-03-26 00:00:00.000,4.926,1,2B77,EUR,2505552,4.926,25,4.926
PORSCHE AUTOM.HLDG VZO,Common stock,08:00,DE000PAH0038,64.68,2018-03-26 00:00:00.000,64.76,8,PAH3,EUR,2504816,64.76,698,64.7


In [17]:
sql_task = NewTask(name='sql', 
                   handler=sql_to_file,
                   params={'sql_query': sql_query_string,
                          'database_url': os.getenv('DATABASE_URL')}
                          )


In [18]:
sql_func = run_local(sql_task)

[mlrun] 2020-06-29 12:44:14,406 starting run sql uid=d32a57bb990d4142bb1f63862e8906bf  -> http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...2e8906bf,0,Jun 29 12:44:14,completed,sql,v3io_user=adminkind=handlerowner=adminhost=jupyter-b9c7995f9-4fblj,,"sql_query=select * from v3io.users.""admin/examples/stocks_tab""database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D",,query result


to track results use .show() or .logs() or in CLI: 
!mlrun get run d32a57bb990d4142bb1f63862e8906bf --project default , !mlrun logs d32a57bb990d4142bb1f63862e8906bf --project default
[mlrun] 2020-06-29 12:44:18,102 run executed, status=completed


In [19]:
fn.run(sql_task)

[mlrun] 2020-06-29 12:44:18,112 starting run sql uid=db9507007f6d452e9ca020e4f483e33b  -> http://mlrun-api:8080
[mlrun] 2020-06-29 12:44:18,387 Job is running in the background, pod: sql-g7p4f
[mlrun] 2020-06-29 12:44:25,033 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...f483e33b,0,Jun 29 12:44:21,completed,sql,v3io_user=adminkind=jobowner=adminhost=sql-g7p4f,,"sql_query=select * from v3io.users.""admin/examples/stocks_tab""database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D",,query result


to track results use .show() or .logs() or in CLI: 
!mlrun get run db9507007f6d452e9ca020e4f483e33b --project default , !mlrun logs db9507007f6d452e9ca020e4f483e33b --project default
[mlrun] 2020-06-29 12:44:27,645 run executed, status=completed


<mlrun.model.RunObject at 0x7f87f8e26c18>