# Compare load APIs

## Spark

In [11]:
#generic
def load(
    self,
    path=None,
    format=None,
    schema=None,
    **options
):
    """Signature-only stub of the generic Spark reader entry point.

    The data format is selected through ``format``; any format-specific
    setting is passed as a free-form keyword in ``**options``.
    The body is intentionally empty: only the signature is being compared.
    """

# specific
def csv(
    self,
    path,
    schema=None,
    sep=None,
    encoding=None,
    quote=None,
    escape=None,
    comment=None,
    header=None,
    inferSchema=None,
    ignoreLeadingWhiteSpace=None,
    ignoreTrailingWhiteSpace=None,
    nullValue=None,
    nanValue=None,
    positiveInf=None,
    negativeInf=None,
    dateFormat=None,
    timestampFormat=None,
    maxColumns=None,
    maxCharsPerColumn=None,
    maxMalformedLogPerPartition=None,
    mode=None,
    columnNameOfCorruptRecord=None,
    multiLine=None,
    charToEscapeQuoteEscaping=None,
    samplingRatio=None,
    enforceSchema=None,
    emptyValue=None,
):
    """Signature-only stub of the format-specific Spark CSV reader.

    Unlike the generic ``load``, every CSV option is spelled out as an
    explicit keyword argument, each defaulting to ``None``.
    The body is intentionally empty: only the signature is being compared.
    """

```
spark = pyspark.Create()
spark.read.load('http://file', format='csv')
spark.read.csv('http://file')
```

## Optimus

In [12]:
#no generic

def csv(
    path,
    sep=',',
    header='true',
    infer_schema='true',
    *args,
    **kwargs
):
    """Signature-only stub of the Optimus CSV loader.

    Only a few options are named explicitly (note the string defaults
    ``'true'`` for ``header`` and ``infer_schema``); anything else is
    accepted through ``*args`` / ``**kwargs``.
    The body is intentionally empty: only the signature is being compared.
    """

```
op = Optimus()
op.load.csv('http://file')
```

## Dask/Pandas

In [13]:
def read_csv(
    urlpath,
    blocksize=64000000,
    collection=True,
    lineterminator=None,
    sample=256000,
    enforce=False,
    assume_missing=False,
    float_precision=None,
    storage_options=None,
    include_path_column=False,
    sep=', ',
    delimiter=None,
    header='infer',
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=None,
    mangle_dupe_cols=True,
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    iterator=False,
    chunksize=None,
    compression='infer',
    thousands=None,
    decimal=b'.',
    quotechar='"',
    memory_map=False,
    quoting=0,
    doublequote=True,
    escapechar=None,
    comment=None,
    encoding=None,
    dialect=None,
    tupleize_cols=None,
    error_bad_lines=True,
    warn_bad_lines=True,
    delim_whitespace=False,
    low_memory=True,
):
    """Signature-only stub of the Dask/pandas ``read_csv`` reader.

    Every option is an explicit keyword argument with a concrete default,
    which makes the signature self-documenting but very long.
    The body is intentionally empty: only the signature is being compared.

    NOTE(review): ``sep=', '`` (comma followed by a space) and
    ``decimal=b'.'`` look like they were copied from rendered documentation;
    pandas documents the default separator as ``','`` - confirm before
    relying on these values.
    """

```
import dask.dataframe as dd
import pandas as pd

dd.read_csv('http://file')
pd.read_csv('http://file')
```

## Koalas

In [14]:
def read_csv(
    path,
    header='infer',
    names=None,
    usecols=None,
    mangle_dupe_cols=True,
    parse_dates=False,
    comment=None,
):
    """Signature-only stub of the Koalas ``read_csv`` reader.

    A short list of explicit keyword arguments, with no catch-all
    ``**kwargs``.
    The body is intentionally empty: only the signature is being compared.
    """

```
import databricks.koalas as ks

ks.read_csv('http://file')
```

## Datafaucet

In [15]:
# generic (uniform, but opaque)
def read(
    path,
    provider=None,
    **kwargs
):
    """Signature-only stub of the generic Datafaucet reader.

    ``path`` names what to read and ``provider`` where to read it from;
    all format-specific settings are passed through ``**kwargs``.
    The body is intentionally empty: only the signature is being compared.
    """

# specific (options are listed directly)
def read_csv(
    path,
    provider=None,
    sep=None,
    encoding=None,
    quote=None,
    escape=None,
    comment=None,
    header=None,
    inferSchema=None,
    ignoreLeadingWhiteSpace=None,
    ignoreTrailingWhiteSpace=None,
    nullValue=None,
    nanValue=None,
    positiveInf=None,
    negativeInf=None,
    dateFormat=None,
    timestampFormat=None,
    maxColumns=None,
    maxCharsPerColumn=None,
    maxMalformedLogPerPartition=None,
    mode=None,
    columnNameOfCorruptRecord=None,
    multiLine=None,
    charToEscapeQuoteEscaping=None,
    samplingRatio=None,
    enforceSchema=None,
    emptyValue=None,
):
    """Signature-only stub of the format-specific Datafaucet CSV reader.

    Takes the same ``path`` / ``provider`` pair as the generic ``read``,
    but lists every CSV option as an explicit keyword argument, each
    defaulting to ``None``.
    The body is intentionally empty: only the signature is being compared.
    """

In [16]:
# Scratch check: None is an instance of NoneType (i.e. type(None)),
# so the isinstance call below evaluates to True.
a = None
isinstance(a, type(None))

True

```
path:
    a metadata alias
    a jdbc uri (jdbc:<subprotocol>:<subname>)
    a url ( hdfs://)
    a local file 
    a table name (provider is a jdbc service)
    a sql query (provider is a jdbc service)
    a resource

provider:
    a metadata alias
    a jdbc uri (jdbc:<subprotocol>:<subname>)
    a url ( hdfs://)
    a local directory
    a data service (like elastic HTTP REST api)
    a resource ( defined with resource( ...) )
    
a resource is an object (currently a dict), 
it can be defined inline in the code or in the metadata yaml file

resource(path_or_url, alias, host, service, port, user, password, driver, database, schema, table, format, hostname, username, **kwargs)

merging resources:
a resource with no schema/service/format is 'abstract'
if service is defined the resource is 'concrete'

resource:
    <alias_name>:
        path,
        format
        
        host | hostname
        port

        driver,
        database,
        schema,
        table,
        user | username ,
        password,
        
        parent

service: mysql
        format: jdbc
        path: sakila
        hostname: mysql
        username: sakila
        password: "{{ env('MYSQL_USER_PASSWORD') }}"
        database: 
        options:
           a: 1
           b: true

```

Why split the request into path and provider?

Because in actual projects, you don't want to hardcode the resource.   
Think, for instance, of user-specific locations, or of dev/test/stage/prod profiles for data processing.  
This API is meant to be flexible enough to support these scenarios, yet easy enough for prototyping and experimentation.

In [1]:
from datafaucet.resource import *

In [2]:
# Load the metadata before building any resource.
# NOTE(review): presumably this is what lets provider aliases such as
# 'pagila', 'hdfs' or 'test' resolve - confirm against datafaucet.
metadata.load()


def _call_spec(*args, **kwargs):
    """Capture the arguments of one ``resource(...)`` call without making it."""
    return args, kwargs


def _call_label(args, kwargs):
    """Render captured arguments as the ``resource(...)`` source text."""
    rendered = [repr(arg) for arg in args]
    rendered += ['{}={!r}'.format(key, value) for key, value in kwargs.items()]
    return 'resource(' + ', '.join(rendered) + ')'


# Every case is declared exactly once. Both the resource object and the label
# printed above it are derived from the same entry, so they cannot drift apart
# (previously the calls were duplicated in a string that was split on '),').
call_specs = [
    _call_spec('SELECT 0 as result where 1 = 0', 'pagila'),
    _call_spec('foo.csv', '/bar'),
    _call_spec('foo.csv', 'bar'),
    _call_spec('foo.csv', 'hdfs'),
    _call_spec('/foo.abc', 'hdfs'),
    _call_spec('/foo.abc', 'test'),
    _call_spec('hello/foo.abc', 'test'),
    _call_spec('foo.abc', 'hdfs://hdfs-namenode:8020/wanna/dance/with/somebody'),
    _call_spec('/foo.abc', 'hdfs://hdfs-namenode/wanna/dance/with/somebody'),
    _call_spec('/foo.abc', 'hdfs://hdfs-namenode:8020/wanna/dance/with/somebody'),
    _call_spec('staff', 'jdbc:mysql://1.2.3.4:3306/sakila?useSSL=false&serverTimezone=UTC&zeroDateTimeBehavior=CONVERT_TO_NULL'),
    _call_spec('staff', 'jdbc:mysql://pippo:baudo@1.2.3.4:3306/sakila?useSSL=false&serverTimezone=UTC&zeroDateTimeBehavior=CONVERT_TO_NULL'),
    _call_spec('staff', 'jdbc:mysql://pippo:baudo@1.2.3.4:3306/sakila?useSSL=false&serverTimezone=UTC&zeroDateTimeBehavior=CONVERT_TO_NULL', useSSL='new!', serverTimezone='new!'),
    _call_spec('staff', 'jdbc:mysql://1.2.3.4/sakila', useSSL='false', serverTimezone='UTC', zeroDateTimeBehavior='CONVERT_TO_NULL'),
    _call_spec('staff', service='mysql', database='sakila', serverTimezone='UTC'),
    _call_spec('sakila/staff', service='mysql', serverTimezone='UTC', user='pippo', password='baudo'),
    _call_spec('foo/bar.tsv', service='s3a'),
    _call_spec('/foo/bar.tsv', service='s3a'),
    _call_spec('/apples/orange', service='minio'),
    _call_spec('SELECT count(*) as cnt from employees;', 'jdbc:mysql://1.2.3.4:3306/sakila?useSSL=false&serverTimezone=UTC&zeroDateTimeBehavior=CONVERT_TO_NULL', user='pippo', password='baudo'),
    _call_spec('ascombe'),
    _call_spec('ascombe', 'saywhat'),
    _call_spec('ascombe', 'hdfs://hdfs-namenode:8020/otherpath/'),
    _call_spec('ascombe', 'hdfs'),
    _call_spec('ascombe', 'test'),
    _call_spec('r_test', 'test'),
]

# Build all resources before printing anything, as the original list literal did,
# so a failing call surfaces before any partial output is produced.
tests = [resource(*args, **kwargs) for args, kwargs in call_specs]

# Source-like text of each call, shown above the resolved resource.
labels = [_call_label(args, kwargs) for args, kwargs in call_specs]

for label, resolved in zip(labels, tests):
    print('#'*10)
    print(label)
    print('-'*10)
    print(resolved)

##########
resource('SELECT 0 as result where 1 = 0', 'pagila')
----------
hash: '0x91507a3213f163a3'
url: jdbc:postgresql://127.0.0.1:5432/pagila
service: postgres
format: jdbc
host: 127.0.0.1
port: 5432
driver: org.postgresql.Driver
database: pagila
schema: public
table: ( SELECT 0 as result where 1 = 0 ) as _query
user:
password: postgres
options: {}

##########
resource('foo.csv', '/bar')
----------
hash: '0x3970b796b788cffb'
url: /bar/foo.csv
service: file
format: csv
host: 127.0.0.1
options: {}

##########
resource('foo.csv', 'bar')
----------
hash: '0xb92c7417c6d17d69'
url: /home/jovyan/work/tutorial/bar/foo.csv
service: file
format: csv
host: 127.0.0.1
options: {}

##########
resource('foo.csv', 'hdfs')
----------
hash: '0xf74c5a930e36238'
url: hdfs://127.0.0.1:8020/foo.csv
service: hdfs
format: csv
host: 127.0.0.1
port: 8020
options:
    header: true
    inferSchema: true

##########
resource('/foo.abc', 'hdfs')
----------
hash: '0x7268d550e983ee5'
url: hdfs://127.0.0.1:8020/f

In [2]:
# Path plus a provider name: the output below shows this resolving to a local
# csv file under a directory named after the provider.
resource('staff.csv', 'abracadabra')

hash: '0x32e53bf0099609fb'
url: /home/jovyan/work/tutorial/abracadabra/staff.csv
service: file
format: csv
host: 127.0.0.1
port:
options: {}

In [3]:
# NOTE(review): a full file:// URL with no provider fails here with
# AttributeError ('NoneType' object has no attribute 'split'), as recorded in
# the output below; this looks like a bug in resource() - confirm.
resource('file:///home/jovyan/work/tutorial/abracadabra/staff.csv')

AttributeError: 'NoneType' object has no attribute 'split'

In [12]:
import functools
# XOR-fold the list: the four equal values cancel out pairwise, so the result is 0.
functools.reduce(lambda acc, value: acc ^ value, [10, 10, 10, 10])

0

In [1]:
from datafaucet.resource import resource

In [2]:
# Single-argument form: the JDBC URL alone describes the resource.
# In the output below, the extra keyword arguments appear under `options`.
jdbc = resource('jdbc:mysql://1.2.3.4/sakila', useSSL='false', serverTimezone='UTC', zeroDateTimeBehavior='CONVERT_TO_NULL')
jdbc

hash: '0x91bc765d6b9d670f'
url: jdbc:mysql://1.2.3.4:3306/sakila
service: mysql
format: jdbc
host: 1.2.3.4
port: 3306
driver: com.mysql.cj.jdbc.Driver
database: sakila
schema: sakila
table: ( SELECT 0 as result where 1 = 0 ) as _query
user:
password:
options:
    useSSL: 'false'
    serverTimezone: UTC
    zeroDateTimeBehavior: CONVERT_TO_NULL

In [3]:
# A bare name with no provider: the output below shows it treated as a local
# file path, with `format` left empty.
table = resource('staff')
table

hash: '0xe483b933cd090226'
url: /home/jovyan/work/tutorial/staff
service: file
format:
host: 127.0.0.1
port:
options: {}

In [4]:
# A resource object can itself be passed as the provider: in the output below,
# 'foobar' is appended to that resource's url.
resource('foobar', table)

hash: '0xedff646c33db78c0'
url: /home/jovyan/work/tutorial/staff/foobar
service: file
format:
host: 127.0.0.1
port:
options: {}