In [1]:
import datafaucet as dfc

In [2]:
# Load the datafaucet project. As the log printed below shows, this creates and
# starts the Spark engine (extra packages, s3a conf, master local[*]).
project = dfc.project.load()

created SparkEngine
Init engine "spark"
Configuring packages:
  -  mysql:mysql-connector-java:8.0.12
  -  org.apache.hadoop:hadoop-aws:3.2.1
Configuring conf:
  -  spark.hadoop.fs.s3a.access.key : ****** (redacted)
  -  spark.hadoop.fs.s3a.endpoint : http://minio:9000
  -  spark.hadoop.fs.s3a.impl : org.apache.hadoop.fs.s3a.S3AFileSystem
  -  spark.hadoop.fs.s3a.path.style.access : true
  -  spark.hadoop.fs.s3a.secret.key : ****** (redacted)
Connecting to spark master: local[*]
Engine context spark:2.4.4 successfully started


In [3]:
# Grab the engine context; the next cell uses it like a Spark session (`spark.range`).
spark = dfc.context()

In [4]:
# Base frame for the demo: range(100) yields the `id` column seen in the grid
# output further down; the random columns are added in the next cell.
df = spark.range(100)

In [5]:
# Add two random integer columns: `g` is the grouping key and `v` holds the
# values whose distinct count is estimated in the cells below.
# NOTE(review): no seed is passed, so the data (and every count shown below)
# presumably changes from run to run — confirm whether randint accepts a seed.
with_group_key = df.cols.create('g').randint(0, 5)
df = with_group_key.cols.create('v').randint(0, 100)

In [6]:
# Render the generated frame as a grid to eyeball the `id`, `g` and `v` columns.
df.data.grid()

Unnamed: 0,id,g,v
0,0,0,56
1,1,0,34
2,2,3,8
3,3,2,64
4,4,0,52
...,...,...,...
95,95,1,55
96,96,3,35
97,97,4,56
98,98,2,19


In [7]:
from pyspark.sql import types as T
from pyspark.sql import functions as F

# Reference numbers: exact and approximate distinct counts of `v` within each
# group `g`, computed with Spark's own aggregate functions.
# NOTE(review): the functions are passed without a name, which is why the
# result columns are labelled with function reprs (memory address included).
(df
    .cols.get('v')
    .groupby('g')
    .agg(F.countDistinct, F.approx_count_distinct)
).data.grid()

Unnamed: 0,g,colname,<function countDistinct at 0x7f9aff991488>,<function approx_count_distinct at 0x7f9aff9910d0>
0,1,v,19,18
1,3,v,19,19
2,4,v,17,17
3,2,v,17,17
4,0,v,17,17


In [8]:
# Variant 1 — hll_init (less efficient): a `v_sketch` column is derived from
# `v` first (presumably one sketch per row — confirm), and those sketches are
# then merged within each group `g` before counting.

from datafaucet.spark import aggregations as A
row_sketches = df.cols.get('v').hll_init(alias='v_sketch')
group_sketches = row_sketches.cols.get('v_sketch').groupby('g').agg({'v_sketch': A.hll_merge()})
group_counts = group_sketches.cols.get('v_sketch').hll_count(alias='hll_count')
# Keep only the group key and the estimate.
group_counts.cols.drop('v_sketch', 'colname').data.grid()



Unnamed: 0,g,hll_count
0,1,19
1,3,19
2,4,17
3,2,17
4,0,17


In [9]:
# Variant 2 — hll_init_agg: the sketch is produced by the group-by aggregation
# itself (very efficient: sketch in memory), then counted per group.

from datafaucet.spark import aggregations as A
sketch_per_group = df.cols.get('v').groupby('g').agg({'v_sketch': A.hll_init_agg()})
count_per_group = sketch_per_group.cols.get('v_sketch').hll_count(alias='hll_count')
# Keep only the group key and the estimate.
count_per_group.cols.drop('v_sketch', 'colname').data.grid()

Unnamed: 0,g,hll_count
0,1,19
1,3,19
2,4,17
3,2,17
4,0,17


In [10]:
# Variant 3 — the same hll_init_agg approach, written with plain
# groupby/agg/select and column expressions instead of the `.cols` helpers.

from datafaucet.spark.aggregations import hll_init_agg
from datafaucet.spark.functions import hll_count
sketch_expr = hll_init_agg()(F.col('v')).alias('v_sketch')
sketched = df.groupby('g').agg(sketch_expr)
count_expr = hll_count()(F.col('v_sketch')).alias('hll_count')
sketched.select('g', count_expr).data.grid()

Unnamed: 0,g,hll_count
0,1,19
1,3,19
2,4,17
3,2,17
4,0,17


In [11]:
# Pre-aggregate the cube: one `v_sketch` per group `g`, stored in `df_cube` so
# the following cells can count from the sketches instead of the raw rows.
sketch_by_group = df.cols.get('v').groupby('g').agg({'v_sketch': A.hll_init_agg()})
df_cube = sketch_by_group.cols.drop('colname')
df_cube.data.grid()

Unnamed: 0,g,v_sketch
0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [12]:
# Count straight from the cube: it is already keyed by `g`, so a per-group
# hll_count is all this cell does (no hll_merge call is involved here).
counted_cube = df_cube.cols.get('v_sketch').hll_count()
# No alias was given, so the count sits in `v_sketch`; rename it for display.
counted_cube.cols.rename('v_sketch', 'hll_count').data.grid()

Unnamed: 0,g,hll_count
0,1,19
1,3,19
2,4,17
3,2,17
4,0,17


In [13]:
# Partial count on the cube: keep groups 0, 2 and 4, merge their sketches into
# a single one with hll_merge, then run hll_count on the merged sketch.
selected_groups = df_cube.rows.filter("g in(0,2,4)")
merged_sketch = selected_groups.cols.get('v_sketch').agg({'v_sketch': A.hll_merge()})
merged_count = merged_sketch.cols.get('v_sketch').hll_count()
(merged_count
    .cols.rename('v_sketch', 'count')
    .cols.drop('colname')
).data.grid()

Unnamed: 0,count
0,39


In [14]:
# Cross-check the sketch-based figure: run Spark's exact (countDistinct) and
# approximate (approx_count_distinct) aggregates on the raw rows of the same
# three groups.
raw_subset = df.rows.filter("g in(0,2,4)")
raw_subset.cols.get('v').agg(F.countDistinct, F.approx_count_distinct).data.grid()

Unnamed: 0,colname,<function countDistinct at 0x7f9aff991488>,<function approx_count_distinct at 0x7f9aff9910d0>
0,v,39,41
