In [1]:
import datafaucet as dfc

Datafaucet is a productivity framework for ETL and ML applications. It simplifies common data-pipeline activities such as project scaffolding, data ingestion, star schema generation, and forecasting.

## Loading and Saving Parquet Data

In [2]:
# Load the 'minimal' project profile. Per the log lines below, this also connects
# to the Spark master and starts the engine context.
dfc.project.load('minimal')

 [datafaucet] NOTICE parquet.ipynb:engine:__init__ | Connecting to spark master: local[*]
 [datafaucet] NOTICE parquet.ipynb:engine:__init__ | Engine context spark:2.4.4 successfully started


<datafaucet.project.Project at 0x7ff5c84d3748>

In [3]:
# Show the metadata of the active profile (engine, providers, resources, logging).
dfc.metadata.profile()

profile: minimal
variables: {}
engine:
    type: spark
    master: local[*]
    jobname:
    timezone: naive
    submit:
        jars: []
        packages: []
        pyfiles:
        files:
        repositories:
        conf:
providers:
    local:
        service: file
        path: data
resources: {}
logging:
    level: info
    stdout: true
    file: datafaucet.log
    kafka: []

### Filter and projection pushdown on Parquet files

The following shows how to selectively read data from partitioned Parquet files.

#### Create data

In [4]:
# Build a 10000-row frame and add a column 'g' whose values are picked at random
# from the four group labels 0..3.
# NOTE(review): no seed is set here, so the per-group counts shown below will
# differ between runs -- TODO confirm whether randchoice accepts a seed.
df = (
    dfc.range(10000)
    .cols.create('g')
    .randchoice([0, 1, 2, 3])
)

# Display the number of rows that landed in each group.
df.cols.groupby('g').agg('count').data.grid()

Unnamed: 0,g,id
0,0,2488
1,1,2488
2,3,2632
3,2,2392


#### Save data as parquet objects

In [5]:
# Repartition by 'g' and save through the 'local' provider as 'groups.parquet'.
# The trailing ';' suppresses the cell's return value.
df.repartition('g').save('local', 'groups.parquet');

 [datafaucet] INFO parquet.ipynb:engine:save_log | save


In [6]:
# List the saved dataset directory; the output shows one g=<value> sub-directory
# per group plus the _SUCCESS marker files.
dfc.list('data/save/groups.parquet').data.grid()

Unnamed: 0,name,type
0,g=2,DIRECTORY
1,g=1,DIRECTORY
2,g=3,DIRECTORY
3,g=0,DIRECTORY
4,_SUCCESS,FILE
5,._SUCCESS.crc,FILE


#### Read data from parquet objects (with pushdown filters)

In [7]:
# Keep a handle on the engine's underlying context so the plain reader API can
# be used directly (presumably a SparkSession -- TODO confirm).
spark = dfc.engine().context

In [8]:
# Read the partitioned dataset back with the plain Spark reader (rebinds `df`).
# The parquet format is named explicitly so the read does not depend on the
# session's default data source setting.
df = spark.read.parquet('data/save/groups.parquet')

In [9]:
# Plain scan, nothing pushed down: the plan shows empty PartitionFilters and
# PushedFilters, and all 4 partitions are read.
df.explain()

== Physical Plan ==
*(1) FileScan parquet [id#91L,g#92] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/home/natbusa/Projects/datafaucet/examples/tutorial/data/save/groups.parquet], PartitionCount: 4, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:bigint>


In [10]:
# Column pruning only: the scan outputs just the partition column g (ReadSchema
# is empty), with no partition or row filters, so all 4 partitions are visited.
df.groupby('g').count().explain()

== Physical Plan ==
*(2) HashAggregate(keys=[g#92], functions=[count(1)])
+- Exchange hashpartitioning(g#92, 200)
   +- *(1) HashAggregate(keys=[g#92], functions=[partial_count(1)])
      +- *(1) FileScan parquet [g#92] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/home/natbusa/Projects/datafaucet/examples/tutorial/data/save/groups.parquet], PartitionCount: 4, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<>


In [11]:
# Row filter only: id > 100 appears in PushedFilters, but PartitionFilters is
# empty, so all 4 partitions are still scanned.
df.filter('id>100').explain()

== Physical Plan ==
*(1) Project [id#91L, g#92]
+- *(1) Filter (isnotnull(id#91L) && (id#91L > 100))
   +- *(1) FileScan parquet [id#91L,g#92] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/home/natbusa/Projects/datafaucet/examples/tutorial/data/save/groups.parquet], PartitionCount: 4, PartitionFilters: [], PushedFilters: [IsNotNull(id), GreaterThan(id,100)], ReadSchema: struct<id:bigint>


In [12]:
# Partition pruning plus row filter: g = 1 becomes a PartitionFilter (only one
# partition is scanned) while id > 100 is pushed down as a row-level filter.
(
    df.filter('id>100 and g=1')
    .groupby('g')
    .count()
    .explain()
)

== Physical Plan ==
*(2) HashAggregate(keys=[g#92], functions=[count(1)])
+- Exchange hashpartitioning(g#92, 200)
   +- *(1) HashAggregate(keys=[g#92], functions=[partial_count(1)])
      +- *(1) Project [g#92]
         +- *(1) Filter (isnotnull(id#91L) && (id#91L > 100))
            +- *(1) FileScan parquet [id#91L,g#92] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/home/natbusa/Projects/datafaucet/examples/tutorial/data/save/groups.parquet], PartitionCount: 1, PartitionFilters: [isnotnull(g#92), (g#92 = 1)], PushedFilters: [IsNotNull(id), GreaterThan(id,100)], ReadSchema: struct<id:bigint>
