In [52]:
import datafaucet as dfc

Datafaucet is a productivity framework for ETL and ML applications. It simplifies common data-pipeline activities such as project scaffolding, data ingestion, star schema generation, and forecasting.

## Loading and Saving Parquet Data

In [53]:
# Load the project with the 'minimal' profile. As the log lines below show,
# this also connects to the Spark master and starts the engine context.
dfc.project.load('minimal')

 [datafaucet] NOTICE parquet.ipynb:engine:__init__ | Connecting to spark master: local[*]
 [datafaucet] NOTICE parquet.ipynb:engine:__init__ | Engine context spark:2.4.4 successfully started


<datafaucet.project.Project at 0x7f3f40acb748>

In [54]:
# Show the metadata of the active profile: engine settings, providers,
# resources and logging configuration.
dfc.metadata.profile()

profile: minimal
variables: {}
engine:
    type: spark
    master: local[*]
    jobname:
    timezone: naive
    submit:
        jars: []
        packages: []
        pyfiles:
        files:
        repositories:
        conf:
providers:
    local:
        service: file
        path: data
resources: {}
logging:
    level: info
    stdout: true
    file: datafaucet.log
    kafka: []

### Filter and projection pushdown on Parquet files

The following shows how to selectively read data from partitioned Parquet files.

#### Create data

In [59]:
# Build a 10000-row frame (column 'id') and add a column 'g' whose values are
# picked from [0, 1, 2, 3].
# NOTE(review): no seed is visible here, so the per-group counts shown below
# presumably change from run to run - confirm whether a seed can be set.
df = dfc.range(10000).cols.create('g').randchoice([0,1,2,3])
# Count the rows for each value of 'g' and render the result as a grid.
df.cols.groupby('g').agg('count').data.grid()

Unnamed: 0,g,id
0,0,2552
1,1,2400
2,3,2568
3,2,2480


#### Save data as parquet objects

In [62]:
# Repartition the frame by 'g' and save it as 'groups.parquet' through the
# 'local' provider declared in the profile.
# The trailing ';' suppresses the cell's return value in the notebook output.
df.repartition('g').save('local', 'groups.parquet');

 [datafaucet] INFO parquet.ipynb:engine:save_log | save


In [63]:
# List the entries under the saved dataset directory; the output shows one
# 'g=<value>' sub-directory per group plus the _SUCCESS marker files.
# NOTE(review): this path ('data/save/groups.parquet') does not match the scan
# Location reported by the physical plans further down
# ('.../groups.parquet/data') - confirm which location is current.
dfc.list('data/save/groups.parquet').data.grid()

Unnamed: 0,name,type
0,g=2,DIRECTORY
1,g=1,DIRECTORY
2,g=3,DIRECTORY
3,g=0,DIRECTORY
4,_SUCCESS,FILE
5,._SUCCESS.crc,FILE


#### Read data from parquet objects (with pushdown filters)

In [64]:
# Keep a handle on the engine's underlying context (presumably the Spark
# session started when the project was loaded - TODO confirm).
# NOTE(review): `spark` is not used by any of the cells visible here.
spark = dfc.engine().context

In [65]:
# Read 'groups.parquet' back through the 'local' provider.
# Note: this rebinds `df`, replacing the in-memory frame created earlier.
df = dfc.load('local', 'groups.parquet')

 [datafaucet] INFO parquet.ipynb:engine:load_log | load


In [66]:
# Baseline: a plain scan with no pushdown. The physical plan reports empty
# PartitionFilters and PushedFilters, and ReadSchema contains both columns
# (id, g).
df.explain()

== Physical Plan ==
*(1) FileScan parquet [id#253L,g#254L] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/home/natbusa/Projects/datafaucet/examples/tutorial/groups.parquet/data], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:bigint,g:bigint>


In [67]:
# Projection (column) pushdown only: the aggregation needs just 'g', so the
# scan's ReadSchema shrinks to struct<g:bigint>. No filters are pushed down.
df.groupby('g').count().explain()

== Physical Plan ==
*(2) HashAggregate(keys=[g#254L], functions=[count(1)])
+- Exchange hashpartitioning(g#254L, 200)
   +- *(1) HashAggregate(keys=[g#254L], functions=[partial_count(1)])
      +- *(1) FileScan parquet [g#254L] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/home/natbusa/Projects/datafaucet/examples/tutorial/groups.parquet/data], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<g:bigint>


In [68]:
# Row-filter pushdown only: the predicate on 'id' shows up in PushedFilters
# (IsNotNull(id), GreaterThan(id,100)). PartitionFilters stays empty, so no
# partition is pruned.
df.filter('id>100').explain()

== Physical Plan ==
*(1) Project [id#253L, g#254L]
+- *(1) Filter (isnotnull(id#253L) && (id#253L > 100))
   +- *(1) FileScan parquet [id#253L,g#254L] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/home/natbusa/Projects/datafaucet/examples/tutorial/groups.parquet/data], PartitionFilters: [], PushedFilters: [IsNotNull(id), GreaterThan(id,100)], ReadSchema: struct<id:bigint,g:bigint>


In [69]:
# Filter on both the row column 'id' and the grouping column 'g', then count
# per group. This is intended to demonstrate partition-filter pushdown combined
# with row (columnar) filter pushdown.
# NOTE(review): in the plan captured below, PartitionFilters is empty and
# EqualTo(g,1) is listed under PushedFilters with 'g' in ReadSchema, i.e. 'g'
# is handled as a regular column rather than a partition column - confirm that
# the dataset is read from the partitioned location.
df.filter('id>100 and g=1').groupby('g').count().explain()

== Physical Plan ==
*(2) HashAggregate(keys=[g#254L], functions=[count(1)])
+- Exchange hashpartitioning(g#254L, 200)
   +- *(1) HashAggregate(keys=[g#254L], functions=[partial_count(1)])
      +- *(1) Project [g#254L]
         +- *(1) Filter (((isnotnull(id#253L) && isnotnull(g#254L)) && (id#253L > 100)) && (g#254L = 1))
            +- *(1) FileScan parquet [id#253L,g#254L] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/home/natbusa/Projects/datafaucet/examples/tutorial/groups.parquet/data], PartitionFilters: [], PushedFilters: [IsNotNull(id), IsNotNull(g), GreaterThan(id,100), EqualTo(g,1)], ReadSchema: struct<id:bigint,g:bigint>
