# Query Parquet

In [136]:
#!pip install mlrun
#!pip install PyHive

In [137]:
# nuclio: ignore
import nuclio

Define the MLRun environment

In [138]:
# Function settings picked up by nuclio-jupyter: kind "job" and the "mlrun/mlrun" image.
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/mlrun"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/mlrun'


## Function

In [139]:
from mlrun import get_or_create_ctx
import numpy as np
import pandas as pd
import pyarrow as pa
from pyarrow.parquet import read_schema
from pyhive import presto  # or import hive
import os


In [140]:
class PrestoClient(object):
    """Small wrapper around a PyHive Presto cursor.

    :param logger:      object exposing ``info(msg)``, used for progress messages
    :param presto_uri:  host of the Presto service
    :param user_name:   user name sent to Presto and used for basic auth
    :param access_key:  access key used as the basic-auth password
    """

    def __init__(self, logger, presto_uri, user_name, access_key):
        self.logger = logger
        self.presto_uri = presto_uri
        self.user_name = user_name
        self.access_key = access_key
        # Set by connect(); None while disconnected.
        self.cursor = None

    def connect(self):
        """Connect to Presto over HTTPS on port 443 and keep the cursor."""
        # NOTE(review): 'verify': False is forwarded to the HTTP layer and
        # presumably disables TLS certificate verification - confirm this is
        # acceptable for the target environment.
        req_kw = {'auth': (self.user_name, self.access_key), 'verify': False}
        self.cursor = presto.connect(self.presto_uri, port=443, username=self.user_name,
                                     protocol='https', requests_kwargs=req_kw).cursor()
        self.logger.info("connected to presto")

    def disconnect(self):
        """Close the cursor if one is open.

        Safe to call before ``connect`` and safe to call more than once.
        """
        if self.cursor is None:
            return
        try:
            self.cursor.close()
        finally:
            # Drop the reference even if close() fails so a retry is a no-op.
            self.cursor = None

    def execute_command(self, command):
        """Execute ``command`` and return the first row of the response.

        :param command: statement text to execute
        :returns: the value of ``cursor.fetchone()`` (also written to the log)
        """
        self.logger.info("executing command {}".format(command))
        self.cursor.execute(command)
        response = self.cursor.fetchone()
        self.logger.info("cursor response {}".format(response))
        return response


class ParquetFile(object):
    """Holds the path of a parquet file together with its schema.

    The schema is read once, at construction time, via ``read_schema``.

    :param logger:     logger kept on the instance
    :param file_path:  path of the parquet file
    """

    def __init__(self, logger, file_path):
        self.logger = logger
        self.path = file_path
        self.schema = None
        self.set_schema(file_path)

    def set_schema(self, file_path):
        """Read the schema of ``file_path`` and store it on ``self.schema``."""
        self.schema = read_schema(file_path)

    def schema_to_tuple(self):
        """Return the schema as a tuple of ``(column_name, column_type)`` pairs."""
        return tuple(zip(self.schema.names, self.schema.types))

class HiveTable(object):
    """Describes an external Hive table and builds its CREATE TABLE statement.

    :param logger:     logger kept on the instance
    :param user_name:  user name used when mapping a ``/User/`` path to v3io
    :param name:       table name; also the name of the table directory
    :param db_schema:  Hive schema (database) the table is created in
    :param location:   parent directory of the table directory; defaults to the
                       working directory at the time the object is created
    """

    def __init__(self, logger, user_name, name='hive_table', db_schema='default', location=None):
        self.logger = logger
        self.name = name
        self.db_schema = db_schema
        # Resolve the default here rather than in the signature, where it
        # would be frozen to the working directory at class-definition time.
        if location is None:
            location = os.getcwd()
        self.location = self.set_location(location, user_name)
        self.schema = None

    @staticmethod
    def _to_v3io_uri(dir_path, user_name):
        """Map a path under ``/User/`` to ``v3io://users/<user_name>/``; return other paths unchanged."""
        prefix = '/User/'
        # Only a leading '/User/' is rewritten; the same text deeper in the
        # path is an ordinary directory name and must be left alone.
        if dir_path.startswith(prefix):
            return 'v3io://users/' + user_name + '/' + dir_path[len(prefix):]
        return dir_path

    def set_location(self, location, user_name):
        """Create ``<location>/<name>/`` if needed and return it as the table location.

        :param location:   parent directory of the table directory
        :param user_name:  user name used for the v3io mapping
        :returns: the directory path (with a trailing slash), rewritten to a
                  ``v3io://users/<user_name>/`` URI when it starts with ``/User/``
        """
        # rstrip avoids a double slash when the caller passes "dir/".
        dir_path = location.rstrip('/') + "/" + self.name + "/"
        # makedirs also creates missing parents and tolerates an existing directory.
        os.makedirs(dir_path, exist_ok=True)
        return self._to_v3io_uri(dir_path, user_name)

    def generate_create_table_script_from_parquet(self, pqt: "ParquetFile"):
        """Build the CREATE TABLE statement for the given parquet file.

        Updates ``self.schema`` from ``pqt.schema_to_tuple()`` as a side effect.

        :param pqt: object exposing ``schema_to_tuple()``
        :returns: the CREATE TABLE statement as a string
        """
        self.set_schema_from_parquet(pqt.schema_to_tuple())
        command = ("CREATE TABLE hive.{db}.{table}{columns}"
                   " WITH (format = 'PARQUET' ,  external_location = '{location}')"
                   ).format(db=self.db_schema, table=self.name,
                            columns=self.schema, location=self.location)
        return command

    def set_schema_from_parquet(self, parquet_schema):
        """Store the column list ``(name type,name type,...)`` on ``self.schema``.

        :param parquet_schema: iterable of ``(column_name, column_type)`` pairs
        """
        columns = []
        for field, value in parquet_schema:
            # Translate the type text only, so a column whose name contains
            # 'string' or 'bool' keeps its name.
            column_type = str(value).replace('string', 'VARCHAR').replace('bool', 'BOOLEAN')
            columns.append(field + ' ' + column_type)
        # join() also handles an empty schema, yielding "()".
        self.schema = "(" + ",".join(columns) + ")"


def query_parquet(context, file_path, v3io_access_key, user_name, presto_uri, hive_table_name='hive_table_name'):
    """query_parquet function

    query parquet file via presto over hive metastore

    Reads the schema of the parquet file, builds a CREATE TABLE statement for
    an external hive table and executes that statement on presto.

    :param context:          function context; its logger is used for all log messages

    :param file_path:        parquet file path

    :param v3io_access_key:  access key used to authenticate against presto

    :param user_name:        user name used for presto and for the table location

    :param presto_uri:       host of the presto service

    :param hive_table_name:  the name of the hive table that will be created in hive metastore
    """
    pqt = ParquetFile(context.logger, file_path)

    hive = HiveTable(context.logger, user_name, hive_table_name)

    create_table = hive.generate_create_table_script_from_parquet(pqt)

    cli = PrestoClient(context.logger, presto_uri, user_name,
                       v3io_access_key)
    cli.connect()
    try:
        cli.execute_command(create_table)
    finally:
        # Release the cursor even when the statement fails.
        cli.disconnect()
    
    

In [141]:
# nuclio: end-code

## Test
> This test uses the metrics data, created by the [Generator function](https://github.com/mlrun/demo-network-operations/blob/master/notebooks/generator.ipynb) from MLRun's [Network Operations Demo](https://github.com/mlrun/demo-network-operations)  
To test it yourself, generate this dataset or use any Parquet dataset you have available (the function reads the schema from a Parquet file).

In [142]:
import os
from mlrun import code_to_function, mount_v3io, NewTask, mlconf, run_local
# Keep an already configured DB path; otherwise use http://mlrun-api:8080.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
# Keep an already configured artifact path; otherwise use $HOME/artifacts
# (raises KeyError if HOME is not set in the environment).
mlconf.artifact_path = mlconf.artifact_path or f'{os.environ["HOME"]}/artifacts'

### Define the execute test task

In [143]:
# Credentials come from the environment so that no secret is stored in the
# notebook source. V3IO_ACCESS_KEY must be set (KeyError otherwise);
# V3IO_USERNAME falls back to 'admin'.
# NOTE(review): task parameters are shown in the run summary, so the access key
# is still visible there - consider passing it as a secret instead.
execute_task = NewTask(name='query-parquet',
                       project='proj',
                       params={'file_path': 'example.parquet',
                               'v3io_access_key': os.environ['V3IO_ACCESS_KEY'],
                               'user_name': os.environ.get('V3IO_USERNAME', 'admin'),
                               'presto_uri': 'presto-api-presto.default-tenant.app.app-lab-development.iguazio-cd0.com'},
                       handler=query_parquet)

In [145]:
# Execute the task with run_local; the handler to call (query_parquet) is set on the task itself.
submit_run = run_local(execute_task)

[mlrun] 2020-07-01 12:57:32,259 starting run query-parquet uid=e2e0447eb69f4dbb864f2e2bb46f60d9  -> http://mlrun-api:8080
[mlrun] 2020-07-01 12:57:32,325 connected to presto
[mlrun] 2020-07-01 12:57:32,325 executing command CREATE TABLE hive.default.hive_table_name(one double,two VARCHAR,three BOOLEAN,__index_level_0__ VARCHAR) WITH (format = 'PARQUET' ,  external_location = 'v3io://users/admin/repos/functions/query_parquet/hive_table_name/')
[mlrun] 2020-07-01 12:57:34,387 cursor response (True,)



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
proj,...b46f60d9,0,Jul 01 12:57:32,completed,query-parquet,v3io_user=adminkind=handlerowner=adminhost=jupyter-66bcbdc885-7jtn6,,file_path=example.parquetv3io_access_key=5860ab7b-160b-4e83-b9b3-61a8c0f3082euser_name=adminpresto_uri=presto-api-presto.default-tenant.app.app-lab-development.iguazio-cd0.com,,


to track results use .show() or .logs() or in CLI: 
!mlrun get run e2e0447eb69f4dbb864f2e2bb46f60d9 --project proj , !mlrun logs e2e0447eb69f4dbb864f2e2bb46f60d9 --project proj
[mlrun] 2020-07-01 12:57:34,436 run executed, status=completed


### Test on cluster

Convert the code to an MLRun function

In [None]:
# Build an MLRun function object named 'query_func' whose handler is query_parquet.
fn = code_to_function('query_func', handler='query_parquet')
# Optional, left disabled: set the service account on the function spec.
#fn.spec.service_account='mlrun-api'
# Apply the v3io mount modifier to the function.
fn.apply(mount_v3io())
# Export the function definition to function.yaml.
fn.export('function.yaml')

In [None]:
# Run the same task through the function object built above.
execute_run = fn.run(execute_task)

### Show results