# Run SQL 

### Set project's configuration

In [1]:
# nuclio: ignore
from mlrun import new_project, code_to_function
# Project name and project directory handed to new_project() below.
project_name='sql'
project_dir = './'
# Create the project object; `sqlproj` is used later in the notebook to
# register the function (set_function) and to run it (func(...).run).
sqlproj = new_project(project_name, project_dir)

### MLRun config

In [2]:
# nuclio: ignore
from os import path
from mlrun import run_local, NewTask, mlconf, code_to_function, mount_v3io
# Keep an already-configured MLRun DB path; otherwise fall back to 'http://mlrun-api:8080'.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'

# Specify paths and artifacts target location (absolute path of the local 'jobs' directory)
artifact_path = path.abspath('jobs')

In [3]:
# nuclio: ignore
import nuclio

In [4]:
%nuclio config spec.image = "mlrun/mlrun"

%nuclio: setting spec.image to 'mlrun/mlrun'


In [5]:
# nuclio: ignore
# Copy the secret files needed for Presto to the v3io directory.
# The secret files are used to store the credentials for the Presto session.

!mkdir -p /v3io/${V3IO_HOME}/secrets
!cp /var/run/iguazio/secrets/* /v3io/${V3IO_HOME}/secrets

In [6]:
%%nuclio cmd -c
pip install pandas==1.0.1
pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999
pip install requests==2.23.0
pip install sqlalchemy==1.3.11
pip install PyMySQL==0.9.3

In [7]:
import pandas as pd 
from os import path
import pyhive
import time
import datetime
from sqlalchemy.engine import create_engine 

def sql(context, sql_query, database_url):
    """Run a SQL query and log the result set as a dataset artifact.

    :param context:      execution context passed to the handler; must provide
                         ``artifact_path`` (base output directory) and
                         ``log_dataset`` (used to log the result)
    :param sql_query:    SQL query string, passed to ``pandas.read_sql``
    :param database_url: database URL, passed to SQLAlchemy ``create_engine``
    """
    target_path = path.join(context.artifact_path, 'data')
    engine = create_engine(database_url)
    try:
        df = pd.read_sql(sql_query, engine)
    finally:
        # Release the engine's pooled connections even when the query fails,
        # so repeated runs do not leave connections open.
        engine.dispose()

    context.log_dataset('query result', df=df, artifact_path=target_path)


In [8]:
# nuclio: end-code

### Save function

In [9]:
import os

# The secrets were copied earlier in this notebook to /v3io/${V3IO_HOME}/secrets,
# so derive the remote path from V3IO_HOME instead of hard-coding one user's
# directory. It is mounted at the same path the files were copied from.
secrets_remote = os.path.join(os.environ['V3IO_HOME'], 'secrets')

# Convert the notebook code to a job function and apply two mounts:
# the default v3io mount, and the secrets directory.
fn = (
    code_to_function(kind='job', name='run_sql')
    .apply(mount_v3io())
    .apply(mount_v3io('v3', secrets_remote, '/var/run/iguazio/secrets/'))
)
fn.spec.description = "Running SQL using presto and store it as a csv file"
fn.metadata.categories =  ["data-prep"]
fn.metadata.labels = {'author': 'adih'}
# Write the function spec to function.yaml (kept last so the cell displays its return value).
fn.export('function.yaml')

[mlrun] 2020-06-15 19:00:33,649 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f9ff2157ba8>

### Build the image

In [10]:
fn.deploy()

[mlrun] 2020-06-15 19:00:35,007 starting remote build, image: .mlrun/func-default-run-sql-latest
[36mINFO[0m[0000] Resolved base name mlrun/mlrun:0.4.7 to mlrun/mlrun:0.4.7 
[36mINFO[0m[0000] Resolved base name mlrun/mlrun:0.4.7 to mlrun/mlrun:0.4.7 
[36mINFO[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.7  
[36mINFO[0m[0002] Retrieving image manifest mlrun/mlrun:0.4.7  
[36mINFO[0m[0004] Built cross stage deps: map[]                
[36mINFO[0m[0004] Retrieving image manifest mlrun/mlrun:0.4.7  
[36mINFO[0m[0005] Retrieving image manifest mlrun/mlrun:0.4.7  
[36mINFO[0m[0006] Unpacking rootfs as cmd RUN pip install pandas==1.0.1 requires it. 
[36mINFO[0m[0047] Taking snapshot of full filesystem...        
[36mINFO[0m[0052] Resolving paths                              
[36mINFO[0m[0058] RUN pip install pandas==1.0.1                
[36mINFO[0m[0058] cmd: /bin/sh                                 
[36mINFO[0m[0058] args: [-c pip install pandas==1.0.1]       

True

### Test

In [12]:
sqlproj.set_function(fn)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f9ff2157ba8>

### Reading from a public MySQL DB

In [13]:
# SQLAlchemy URL for the public MySQL database used in this example (PyMySQL driver, `Rfam` schema).
db_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam'
# Query selecting a few columns from the `family` table.
sql_query_string = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family'

In [14]:
# Task definition: run the `sql` handler with the query and database URL
# defined above, writing artifacts under `artifact_path`.
sql_task = NewTask(
    name='sql',
    handler=sql,
    params={
        'sql_query': sql_query_string,
        'database_url': db_url,
    },
    artifact_path=artifact_path,
)

In [15]:
sql_func = run_local(sql_task)

[mlrun] 2020-06-15 19:04:12,418 starting run sql uid=6da3d61c2efb452ca2e97a8396293785  -> http://10.193.140.11:8080
[mlrun] 2020-06-15 19:04:15,052 log artifact query result at /User/adi/jobs/data/query result.csv, size: 254798, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...96293785,0,Jun 15 19:04:12,completed,sql,v3io_user=adikind=handlerowner=adihost=jupyter-adi-9bff7b6f-k7rsd,,"sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM familydatabase_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",,query result


to track results use .show() or .logs() or in CLI: 
!mlrun get run 6da3d61c2efb452ca2e97a8396293785 --project default , !mlrun logs 6da3d61c2efb452ca2e97a8396293785 --project default
[mlrun] 2020-06-15 19:04:15,121 run executed, status=completed


#### Run it on a cluster

In [16]:
# Look up the 'run-sql' function in the project, apply the v3io mount, and run the task with it.
sqlproj.func('run-sql').apply(mount_v3io()).run(sql_task)

[mlrun] 2020-06-15 19:04:17,831 starting run sql uid=5b04880f29304cc0a1bd8ae7886a61e2  -> http://10.193.140.11:8080
[mlrun] 2020-06-15 19:04:17,943 Job is running in the background, pod: sql-lkb9z
[mlrun] 2020-06-15 19:04:27,897 starting local run: main.py # sql
[mlrun] 2020-06-15 19:04:30,210 log artifact query result at /User/adi/jobs/data/query result.csv, size: 254798, db: Y

[mlrun] 2020-06-15 19:04:30,238 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
sql,...886a61e2,0,Jun 15 19:04:27,completed,sql,host=sql-lkb9zkind=jobowner=adiv3io_user=adi,,"database_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfamsql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family",,query result


to track results use .show() or .logs() or in CLI: 
!mlrun get run 5b04880f29304cc0a1bd8ae7886a61e2 --project sql , !mlrun logs 5b04880f29304cc0a1bd8ae7886a61e2 --project sql
[mlrun] 2020-06-15 19:04:37,238 run executed, status=completed


<mlrun.model.RunObject at 0x7fa0229bef98>

### Run SQL from Iguazio Key Value

You need to create a table and set the `sql_table` path accordingly. <br>
You can find an example of creating such a table in https://github.com/v3io/tutorials/blob/development/data-ingestion-and-preparation/basic-data-ingestion-and-preparation.ipynb

In [17]:
# nuclio: ignore
import os
# Table name of the form v3io.users."<V3IO_USERNAME>/examples/stocks_tab".
# Built with plain string formatting: the original wrapped the string in a
# single-argument os.path.join() and appended "", both of which are no-ops.
sql_table = 'v3io.users."{}/examples/stocks_tab"'.format(os.getenv('V3IO_USERNAME'))
sql_query_string = 'select * from ' + sql_table

In [18]:
%sql select * from $sql_table limit 10

Done.


securitydesc,securitytype,time,isin,minprice,date,endprice,numberoftrades,mnemonic,currency,securityid,maxprice,tradedvolume,startprice
"TESLA INC. DL -,001",Common stock,08:00,US88160R1014,246.31,2018-03-26 00:00:00.000,247.0,3,TL0,EUR,2506575,247.0,201,246.81
BRENNTAG AG NA O.N.,Common stock,08:00,DE000A1DAHH0,48.14,2018-03-26 00:00:00.000,48.14,2,BNR,EUR,2504453,48.14,185,48.14
UNIPER SE NA O.N. BAR,Common stock,08:34,DE000UNSE1V6,21.8,2018-03-26 00:00:00.000,21.8,1,UN02,EUR,2774277,21.8,15,21.8
HANNOVER RUECK SE NA O.N.,Common stock,08:00,DE0008402215,107.2,2018-03-26 00:00:00.000,107.2,1,HNR1,EUR,2505132,107.2,5,107.2
LYXOR MSCI E.M.U.ETF C-EO,ETF,08:04,FR0010429068,10.4135,2018-03-26 00:00:00.000,10.4135,1,LYM7,EUR,2505242,10.4135,34,10.4135
HAMBORNER REIT AG O.N.,Common stock,08:00,DE0006013006,9.32,2018-03-26 00:00:00.000,9.32,1,HAB,EUR,2504998,9.32,173,9.32
DRAEGERWERK VZO O.N.,Common stock,08:01,DE0005550636,77.7,2018-03-26 00:00:00.000,77.7,3,DRW3,EUR,2504952,77.85,235,77.85
MAGFORCE AG,Common stock,08:08,DE000A0HGQF5,5.82,2018-03-26 00:00:00.000,5.87,4,MF6,EUR,2504308,5.87,2000,5.87
DEUTSCHE EUROSHOP NA O.N.,Common stock,08:00,DE0007480204,29.04,2018-03-26 00:00:00.000,29.04,7,DEQ,EUR,2505102,29.04,1492,29.04
IS.S.E.600 HEA.C.U.ETF A.,ETF,08:17,DE000A0Q4R36,67.44,2018-03-26 00:00:00.000,67.44,1,EXV4,EUR,2504394,67.44,50,67.44


In [19]:
db_url=os.getenv('DATABASE_URL')

In [20]:
env DATABASE_URL

'presto://adi:6fa83107-07f1-499f-ac8f-3abd14d2c31f@presto-api-presto.default-tenant.app.dev34.lab.iguazeng.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D'

In [21]:
# Rebuild the task so it picks up the current `sql_query_string` and `db_url`
# values; the handler and artifact location are unchanged.
sql_task = NewTask(
    name='sql',
    handler=sql,
    params={
        'sql_query': sql_query_string,
        'database_url': db_url,
    },
    artifact_path=artifact_path,
)

In [22]:
sql_func = run_local(sql_task)

[mlrun] 2020-06-15 19:04:45,594 starting run sql uid=c176f03818a44748a0242c030424941c  -> http://10.193.140.11:8080
[mlrun] 2020-06-15 19:04:49,010 log artifact query result at /User/adi/jobs/data/query result.csv, size: 92587, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...0424941c,0,Jun 15 19:04:45,completed,sql,v3io_user=adikind=handlerowner=adihost=jupyter-adi-9bff7b6f-k7rsd,,"sql_query=select * from v3io.users.""adi/examples/stocks_tab""database_url=presto://adi:6fa83107-07f1-499f-ac8f-3abd14d2c31f@presto-api-presto.default-tenant.app.dev34.lab.iguazeng.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7D",,query result


to track results use .show() or .logs() or in CLI: 
!mlrun get run c176f03818a44748a0242c030424941c --project default , !mlrun logs c176f03818a44748a0242c030424941c --project default
[mlrun] 2020-06-15 19:04:49,084 run executed, status=completed


In [23]:
# Run the current `sql_task` with the project's 'run-sql' function, after applying the v3io mount.
sqlproj.func('run-sql').apply(mount_v3io()).run(sql_task)

[mlrun] 2020-06-15 19:04:49,107 starting run sql uid=b90e3c6131a44bf4beefc8c5546d8acd  -> http://10.193.140.11:8080
[mlrun] 2020-06-15 19:04:49,233 Job is running in the background, pod: sql-gkxt8
[mlrun] 2020-06-15 19:04:55,452 starting local run: main.py # sql
[mlrun] 2020-06-15 19:04:59,166 log artifact query result at /User/adi/jobs/data/query result.csv, size: 92587, db: Y

[mlrun] 2020-06-15 19:04:59,206 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
sql,...546d8acd,0,Jun 15 19:04:55,completed,sql,host=sql-gkxt8kind=jobowner=adiv3io_user=adi,,"database_url=presto://adi:6fa83107-07f1-499f-ac8f-3abd14d2c31f@presto-api-presto.default-tenant.app.dev34.lab.iguazeng.com:443/v3io?protocol=https&requests_kwargs=%7B%22verify%22%3A+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22cert%22%3A+%5B%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.crt%22%2C+%22%2Fvar%2Frun%2Figuazio%2Fsecrets%2Ftls.key%22%5D%7Dsql_query=select * from v3io.users.""adi/examples/stocks_tab""",,query result


to track results use .show() or .logs() or in CLI: 
!mlrun get run b90e3c6131a44bf4beefc8c5546d8acd --project sql , !mlrun logs b90e3c6131a44bf4beefc8c5546d8acd --project sql
[mlrun] 2020-06-15 19:05:01,578 run executed, status=completed


<mlrun.model.RunObject at 0x7fa0211e0a90>