# SQL Ingest - Ingest data using SQL query 

In [1]:
# nuclio: ignore
import nuclio

In [2]:
# Set the function kind to "job" and its base image to mlrun/mlrun.
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/mlrun"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/mlrun'


In [3]:
%%nuclio cmd -c 
pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 
pip install sqlalchemy==1.3.11
pip install PyMySQL==0.9.3

In [4]:
import pandas as pd
import pyhive
from sqlalchemy.engine import create_engine
from mlrun.execution import MLClientCtx


def sql_ingest(
    context: MLClientCtx,
    sql_query: str,
    database_url: str,
    file_ext: str = "parquet",
) -> None:
    """SQL Ingest - Ingest data using SQL query

    Runs ``sql_query`` against the database at ``database_url``, loads the
    result into a pandas DataFrame and logs it as the ``'query result'``
    dataset under the run's ``data`` artifact sub-path.

    NOTE(review): ``database_url`` is passed as a regular run parameter, so
    credentials embedded in the URL appear to be recorded with the run's
    parameters - confirm, and consider a secret-based mechanism.

    :param context:           the function context
    :param sql_query:         the sql query used to retrieve the data
    :param database_url:      database connection URL
    :param file_ext:          ("parquet") format for result file
    """
    engine = create_engine(database_url)
    try:
        # No chunksize is given, so the whole result is materialized here and
        # the engine is no longer needed once read_sql returns.
        df = pd.read_sql(sql_query, engine)
    finally:
        # Release the engine's pooled connections even when the query fails.
        engine.dispose()

    context.log_dataset('query result',
                        df=df,
                        format=file_ext,
                        artifact_path=context.artifact_subpath('data'))


In [5]:
# nuclio: end-code

### mlconfig

In [6]:
from mlrun import mlconf
import os
# Keep values that are already configured; otherwise fall back to
# 'http://mlrun-api:8080' and to an 'artifacts' folder under $HOME.
# The HOME lookup only happens when artifact_path is not already set.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'


### Save function

In [7]:
from mlrun import code_to_function
import os

# Create the MLRun function object with sql_ingest as its handler.
fn = code_to_function(
    name="sql_ingest",
    handler="sql_ingest",
    description="SQL Ingest - Ingest data using SQL query",
    categories=["data-prep"],
    labels={"author": "adih"},
)

# Only when V3IO_HOME is defined: copy the Iguazio secrets under the v3io
# home directory and apply a v3io mount to the function.
if "V3IO_HOME" in os.environ:
    !mkdir -p /v3io/${V3IO_HOME}/secrets
    !cp /var/run/iguazio/secrets/* /v3io/${V3IO_HOME}/secrets

    from mlrun import mount_v3io

    secrets_remote = 'users/' + os.getenv('V3IO_USERNAME') + '/secrets'
    secrets_mount_path = '/var/run/iguazio/secrets/'
    fn.apply(mount_v3io('tls', secrets_remote, secrets_mount_path))


### Build the image

In [8]:
# Start the remote image build for the function; the build log is shown below.
fn.deploy()

[mlrun] 2020-06-28 07:22:56,344 starting remote build, image: .mlrun/func-default-sql-ingest-latest
[36mINFO[0m[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 
[36mINFO[0m[0000] Resolved base name mlrun/mlrun:0.4.10 to mlrun/mlrun:0.4.10 
[36mINFO[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
[36mINFO[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
[36mINFO[0m[0000] Built cross stage deps: map[]                
[36mINFO[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
[36mINFO[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.10 
[36mINFO[0m[0000] Unpacking rootfs as cmd RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 requires it. 
[36mINFO[0m[0026] Taking snapshot of full filesystem...        
[36mINFO[0m[0038] Resolving paths                              
[36mINFO[0m[0045] RUN pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999 
[36mINFO[0m[0045] cmd: /bin/sh       

True

In [9]:
# Save the function spec to function.yaml.
fn.export('function.yaml')

[mlrun] 2020-06-28 07:25:02,547 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f76724ddf98>

## Test

### Reading from a public MySQL DB

In [10]:
# Connection URL of the public MySQL database used for this test.
mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam'

# Query selecting a handful of columns from the `family` table.
mysql_query = (
    'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source '
    'FROM family'
)

In [11]:
from mlrun import NewTask, run_local

# Task that calls sql_ingest with the MySQL query and connection URL.
sql_task = NewTask(
    name='sql',
    handler=sql_ingest,
    params={
        'sql_query': mysql_query,
        'database_url': mysql_url,
    },
)


In [12]:
# Run the task locally, from the notebook environment.
# NOTE(review): despite its name, sql_func presumably holds the run result,
# not a function - confirm.
sql_func = run_local(sql_task)

[mlrun] 2020-06-28 07:25:19,774 starting run sql uid=e4066d93fc164614a770ab65526d2b5b  -> http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...526d2b5b,0,Jun 28 07:25:19,completed,sql,v3io_user=adminkind=handlerowner=adminhost=jupyter-b9c7995f9-z8fvh,,"sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM familydatabase_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",,query result


to track results use .show() or .logs() or in CLI: 
!mlrun get run e4066d93fc164614a770ab65526d2b5b --project default , !mlrun logs e4066d93fc164614a770ab65526d2b5b --project default
[mlrun] 2020-06-28 07:25:23,231 run executed, status=completed


#### Run it on a cluster

In [13]:
# Run the same task through the deployed function; per the log below it
# executes as a job in its own pod.
fn.run(sql_task)

[mlrun] 2020-06-28 07:25:29,956 starting run sql uid=e8f5c41abd694181b410fc3c5910e8ba  -> http://mlrun-api:8080
[mlrun] 2020-06-28 07:25:30,305 Job is running in the background, pod: sql-x6sjp
[mlrun] 2020-06-28 07:25:38,126 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...5910e8ba,0,Jun 28 07:25:35,completed,sql,v3io_user=adminkind=jobowner=adminhost=sql-x6sjp,,"sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM familydatabase_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",,query result


to track results use .show() or .logs() or in CLI: 
!mlrun get run e8f5c41abd694181b410fc3c5910e8ba --project default , !mlrun logs e8f5c41abd694181b410fc3c5910e8ba --project default
[mlrun] 2020-06-28 07:25:39,560 run executed, status=completed


<mlrun.model.RunObject at 0x7f76724f1cf8>

### SQL query from Iguazio Key Value via Presto

You need to create a table and set the `sql_table` path accordingly. <br>
You can find an example of creating such a table in https://github.com/v3io/tutorials/blob/master/data-ingestion-and-preparation/basic-data-ingestion-and-preparation.ipynb

In [14]:
# nuclio: ignore
import os
# Quoted table path under the current user's folder. When V3IO_USERNAME is
# unset the literal text "None" is used, exactly as str(None) would give.
sql_table = 'v3io.users."{}/examples/stocks_tab"'.format(os.getenv('V3IO_USERNAME'))

# Full-table query that is later passed to sql_ingest.
sql_query_string = 'select * from {}'.format(sql_table)

In [15]:
# Preview the first 10 rows of the table.
# NOTE(review): this notebook never loads the %sql magic or sets up its
# connection - presumably the environment preconfigures it; confirm.
%sql select * from $sql_table limit 10

Done.


securitydesc,securitytype,time,isin,minprice,date,endprice,numberoftrades,mnemonic,currency,securityid,maxprice,tradedvolume,startprice
UBS I.ETF-DL G.SEL.DIV.AD,ETF,08:27,IE00BMP3HG27,8.418,2018-03-26 00:00:00.000,8.418,1,UBUM,EUR,2505450,8.418,403,8.418
"GILEAD SCIENCES DL-,001",Common stock,08:00,US3755581036,59.7,2018-03-26 00:00:00.000,59.84,3,GIS,EUR,2506495,59.84,745,59.7
"3M CO. DL-,01",Common stock,08:00,US88579Y1010,176.51,2018-03-26 00:00:00.000,176.51,1,MMM,EUR,2506577,176.51,39,176.51
DIEBOLD NIXDORF INH.O.N.,Common stock,08:06,DE000A0CAYB2,66.3,2018-03-26 00:00:00.000,66.3,1,WIN,EUR,2504286,66.3,60,66.3
XTR.II EUR.INF.LINK.BD 1C,ETF,08:13,LU0290358224,218.97,2018-03-26 00:00:00.000,218.97,1,DBXK,EUR,2505840,218.97,110,218.97
UBS-ETF-MSCI EMU S.C.EOAD,ETF,08:33,LU0671493277,100.2,2018-03-26 00:00:00.000,100.2,1,UEFD,EUR,2506045,100.2,180,100.2
ASMALLWORLD AG SF 1,Common stock,08:23,CH0404880129,12.7,2018-03-26 00:00:00.000,12.7,1,1Q7,EUR,3089122,12.7,400,12.7
IS.DJ GLOB.TITAN.50 U.ETF,ETF,08:42,DE0006289382,31.25,2018-03-26 00:00:00.000,31.25,1,EXI2,EUR,2505029,31.25,50,31.25
ISHS IV-AGEING POPUL.ETF,ETF,08:17,IE00BYZK4669,4.926,2018-03-26 00:00:00.000,4.926,1,2B77,EUR,2505552,4.926,25,4.926
PORSCHE AUTOM.HLDG VZO,Common stock,08:00,DE000PAH0038,64.68,2018-03-26 00:00:00.000,64.76,8,PAH3,EUR,2504816,64.76,698,64.7


In [16]:
# Task that calls sql_ingest with the table query; the connection URL is read
# from the DATABASE_URL environment variable.
# NOTE(review): the URL is recorded in the run parameters shown in the run
# output, including any credentials embedded in it - confirm and redact.
sql_task = NewTask(
    name='sql',
    handler=sql_ingest,
    params={
        'sql_query': sql_query_string,
        'database_url': os.getenv('DATABASE_URL'),
    },
)


In [17]:
# Run the task locally, from the notebook environment.
# NOTE(review): despite its name, sql_func presumably holds the run result,
# not a function - confirm.
sql_func = run_local(sql_task)

[mlrun] 2020-06-28 07:27:46,166 starting run sql uid=17a946cc923d469fbd0881629553eca0  -> http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...9553eca0,0,Jun 28 07:27:46,completed,sql,v3io_user=adminkind=handlerowner=adminhost=jupyter-b9c7995f9-z8fvh,,"sql_query=select * from v3io.users.""admin/examples/stocks_tab""database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D",,query result


to track results use .show() or .logs() or in CLI: 
!mlrun get run 17a946cc923d469fbd0881629553eca0 --project default , !mlrun logs 17a946cc923d469fbd0881629553eca0 --project default
[mlrun] 2020-06-28 07:27:49,867 run executed, status=completed


In [18]:
# Run the same task through the deployed function; per the log below it
# executes as a job in its own pod.
fn.run(sql_task)

[mlrun] 2020-06-28 07:27:54,526 starting run sql uid=674bad94883040f8beb9429058d944e9  -> http://mlrun-api:8080
[mlrun] 2020-06-28 07:27:54,829 Job is running in the background, pod: sql-d4792
[mlrun] 2020-06-28 07:28:03,227 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...58d944e9,0,Jun 28 07:27:59,completed,sql,v3io_user=adminkind=jobowner=adminhost=sql-d4792,,"sql_query=select * from v3io.users.""admin/examples/stocks_tab""database_url=presto://admin:8278ee8e-0f31-4aea-a105-2eab202bec93@presto-api-presto.default-tenant.app.cs-mlrun-test.iguazio-c0.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D",,query result


to track results use .show() or .logs() or in CLI: 
!mlrun get run 674bad94883040f8beb9429058d944e9 --project default , !mlrun logs 674bad94883040f8beb9429058d944e9 --project default
[mlrun] 2020-06-28 07:28:07,230 run executed, status=completed


<mlrun.model.RunObject at 0x7f76624f85c0>