# SQL Ingest - Ingest data using SQL query 

In [37]:
import mlrun
# Name under which the MLRun project is registered
project_name_base = 'functions'

# Load the project from the MLRun DB, or create it if it does not exist yet
project = mlrun.get_or_create_project(
    project_name_base,
    context="./",
    user_project=False,
)

print(f'Project name: {project.metadata.name}')

> 2022-03-04 20:29:28,680 [info] loaded project functions from MLRun DB
Project name: functions


In [38]:
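# Optional: uncomment to install the packages imported by sql_to_file.py
# in the notebook environment.
# pip install --no-cache-dir git+https://github.com/v3io/PyHive.git@v0.6.999
# pip install sqlalchemy==1.3.11
# pip install PyMySQL==0.9.3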

In [39]:
%%writefile sql_to_file.py

import pandas as pd
import pyhive
from sqlalchemy.engine import create_engine
from mlrun.execution import MLClientCtx


def sql_to_file(
    context: MLClientCtx,
    sql_query: str,
    database_url: str,
    file_ext: str = "parquet",
) -> None:
    """SQL Ingest - Ingest data using SQL query

    :param context:           the function context
    :param sql_query:         the sql query used to retrieve the data
    :param database_url:      database connection URL
    :param file_ext:          ("parquet") format for result file
    """

    # Run the query and load the full result set into a DataFrame.
    engine = create_engine(database_url)
    try:
        df = pd.read_sql(sql_query, engine)
    finally:
        # Release the engine's pooled connections even when the query fails,
        # so the database is not left with dangling connections.
        engine.dispose()

    # Log the result as the 'query result' dataset artifact of this run,
    # stored under the run's 'data' artifact sub-path in the requested format.
    context.log_dataset('query result',
                        df=df,
                        format=file_ext,
                        artifact_path=context.artifact_subpath('data'))


Overwriting sql_to_file.py


### Save function

In [40]:
# Target image name for the function build; it is assigned to
# fn.spec.build.image in the next cell.
image='.mlrun/sqltofile'

In [42]:
from mlrun import code_to_function, NewTask
import os

# Wrap sql_to_file.py as an MLRun job function. The listed requirements are
# pip-installed on top of the base image when the function image is built.
fn = code_to_function(
    name="sql_to_file",
    kind="job",
    filename="sql_to_file.py",
    handler="sql_to_file",
    description="SQL To File - Ingest data using SQL query",
    image="mlrun/mlrun",
    requirements=['pyhive', 'sqlalchemy', 'PyMySQL'],
    categories=["data-prep"],
    labels={"author": "xingsheng"},
)

# Apply the v3io mount modifier to the function
fn.apply(mlrun.mount_v3io())

# Name of the image that the build step should produce
fn.spec.build.image = image

### Build the image

In [43]:
# Build the function image; the build log is streamed in the cell output.
# NOTE(review): with_mlrun=False presumably skips installing mlrun during the
# build because the base image is mlrun/mlrun -- confirm.
fn.deploy(with_mlrun=False)

> 2022-03-04 20:29:50,481 [info] Started building image: .mlrun/sqltofile
E0304 20:29:52.116581       1 aws_credentials.go:77] while getting AWS credentials NoCredentialProviders: no valid providers in chain. Deprecated.
	For verbose messaging see aws.Config.CredentialsChainVerboseErrors
[36mINFO[0m[0000] Retrieving image manifest mlrun/mlrun:0.9.1  
[36mINFO[0m[0000] Retrieving image mlrun/mlrun:0.9.1 from registry index.docker.io 
[36mINFO[0m[0000] Built cross stage deps: map[]                
[36mINFO[0m[0000] Retrieving image manifest mlrun/mlrun:0.9.1  
[36mINFO[0m[0000] Returning cached image manifest              
[36mINFO[0m[0000] Executing 0 build triggers                   
[36mINFO[0m[0000] Unpacking rootfs as cmd RUN python -m pip install pyhive sqlalchemy PyMySQL requires it. 
[36mINFO[0m[0023] RUN python -m pip install pyhive sqlalchemy PyMySQL 
[36mINFO[0m[0023] Taking snapshot of full filesystem...        
[36mINFO[0m[0032] cmd: /bin/sh              

True

In [59]:
# print(fn.to_yaml())

In [60]:
# Save the function spec to function.yaml so it can be re-imported later
# with mlrun.import_function.
fn.export('function.yaml')

> 2022-03-04 20:40:09,965 [info] function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f8dbca49ed0>

## Test

### Reading from a public MySQL DB

In [61]:
# Connection URL for the public Rfam MySQL database, using the PyMySQL driver
# (no password in the URL).
mysql_url = 'mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam'
# Query selecting a handful of columns from the `family` table.
mysql_query = 'select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM family'

In [62]:
from mlrun import NewTask, run_local

# The %%writefile cell only writes sql_to_file.py to disk; it does not define
# sql_to_file in the kernel. Import the handler explicitly so this cell works
# after Restart Kernel -> Run All.
from sql_to_file import sql_to_file

# Task definition shared by the local run and the cluster runs below:
# the handler to execute plus the query and connection URL it receives.
sql_task = NewTask(name='sql',
                   handler=sql_to_file,
                   params={'sql_query': mysql_query,
                           'database_url': mysql_url})

In [63]:
# Execute the task locally, in the notebook environment, as a quick check
# before running it on the cluster.
sql_func = run_local(sql_task)

> 2022-03-04 20:40:31,759 [info] starting run sql uid=7f6493bfe0104c109cd7071855d68dfa DB=http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
functions,...55d68dfa,0,Mar 04 20:40:31,completed,sql,v3io_user=xingshengkind=handlerowner=xingshenghost=jupyter-xingsheng-ddd79bd7c-b7btb,,"sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM familydatabase_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",,query result





> 2022-03-04 20:40:34,580 [info] run executed, status=completed


#### Run it on a cluster

In [64]:
# Run the same task with the built function; per the log output it executes
# in a pod on the cluster.
fn.run(sql_task)

> 2022-03-04 20:40:51,598 [info] starting run sql uid=9137a1073ca64439beb8041cbfe8d283 DB=http://mlrun-api:8080
> 2022-03-04 20:40:52,041 [info] Job is running in the background, pod: sql-l4drl
> 2022-03-04 20:40:58,588 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
functions,...bfe8d283,0,Mar 04 20:40:55,completed,sql,v3io_user=xingshengkind=jobowner=xingshenghost=sql-l4drl,,"sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM familydatabase_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",,query result





> 2022-03-04 20:41:01,637 [info] run executed, status=completed


<mlrun.model.RunObject at 0x7f8dbc4ea290>

### Import the function

You need to create a table and set the sql_table path accordingly. <br>
You can find an example of creating such a table in https://github.com/v3io/tutorials/blob/master/data-ingestion-and-preparation/basic-data-ingestion-and-preparation.ipynb<br>
function = mlrun.import_function("hub://sklearn_classifier")<br>
function = mlrun.import_function("./func.yaml")<br>
function = mlrun.import_function("https://raw.githubusercontent.com/org/repo/func.yaml")<br>

#### Set function example:
project.set_function('hub://describe', 'describe')

In [65]:
# project.set_function('sql_to_file.py', 'prep-data', kind='job', image='mlrun/mlrun')

#### Load function examples:
my_describe = project.func('describe')

In [66]:
# fn = project.func('prep-data')

In [67]:
# fn = project.get_function('prep-data')

In [68]:
# Re-create the function object from the spec exported to function.yaml
# earlier in this notebook. This rebinds `fn`, replacing the object built
# with code_to_function.
fn = mlrun.import_function("./function.yaml")

In [69]:
# Print the imported function spec as YAML to verify what was loaded.
# NOTE: the printed spec includes a credentials section (access_key);
# clear or redact this output before sharing the notebook.
print(fn.to_yaml())

kind: job
metadata:
  name: sql-to-file
  tag: ''
  hash: ec914b3071626f878134db936a7d0b25fd6ef446
  project: functions
  labels:
    author: xingsheng
  categories:
  - data-prep
  credentials:
    access_key: <redacted>
spec:
  command: ''
  args: []
  image: .mlrun/sqltofile
  volumes: []
  volume_mounts: []
  env:
  - name: V3IO_API
    value: ''
  - name: V3IO_USERNAME
    value: ''
  - name: V3IO_ACCESS_KEY
    value: ''
  - name: V3IO_FRAMESD
    value: ''
  default_handler: sql_to_file
  entry_points:
    sql_to_file:
      name: sql_to_file
      doc: SQL Ingest - Ingest data using SQL query
      parameters:
      - name: context
        type: MLClientCtx
        doc: the function context
        default: ''
      - name: sql_query
        type: str
        doc: the sql query used to retrieve the data
        default: ''
      - name: database_url
        type: str
        doc: database connection URL
        default: ''
      - name: file_ext
      

In [54]:
# fn.spec.image=image

In [56]:
# print(fn.to_yaml())

In [70]:
# Run the task again, this time with the function imported from
# function.yaml, to confirm the exported spec is usable.
fn.run(sql_task)

> 2022-03-04 20:41:40,767 [info] starting run sql uid=fc4ec1f394794f33becbcf8f11aeea2c DB=http://mlrun-api:8080
> 2022-03-04 20:41:41,420 [info] Job is running in the background, pod: sql-mjvcs
> 2022-03-04 20:41:48,211 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
functions,...11aeea2c,0,Mar 04 20:41:45,completed,sql,v3io_user=xingshengkind=jobowner=xingshenghost=sql-mjvcs,,"sql_query=select rfam_acc,rfam_id,auto_wiki,description,author,seed_source FROM familydatabase_url=mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam",,query result





> 2022-03-04 20:41:51,357 [info] run executed, status=completed


<mlrun.model.RunObject at 0x7f8dbc559850>