# Datafaucet

Datafaucet is a productivity framework for ETL and ML applications. It simplifies common data-pipeline activities such as project scaffolding, data ingestion, star schema generation, and forecasting.

In [1]:
import datafaucet as dfc
dfc.__version__

'0.10.0'

## Loading and Saving Data

In [34]:
# Load the datafaucet project; the log output recorded below shows this also starts the Spark engine session.
dfc.project.load()

 [datafaucet] NOTICE scd.ipynb:engine:set_submit_args | Configuring packages:
 [datafaucet] NOTICE scd.ipynb:engine:set_submit_args |   -  mysql:mysql-connector-java:8.0.12
 [datafaucet] NOTICE scd.ipynb:engine:__init__ | Connecting to spark master: local[*]
 [datafaucet] NOTICE scd.ipynb:engine:start_session | Engine context spark:3.0.0-SNAPSHOT successfully started


<datafaucet.project.Project at 0x7f21132dd2d0>

In [35]:
def equal(a,b):
    """Return True when `a` and `b` have no rows left over after subtracting each other.

    Both arguments must expose `exceptAll(other)` whose result exposes `count()`.
    The difference is counted in both directions, so a row that is present
    (or present more often) on either side makes the result False.
    """
    only_in_a = a.exceptAll(b).count()
    only_in_b = b.exceptAll(a).count()
    return (only_in_a + only_in_b) == 0

def mask_rootdir(resource):
    """Return a copy of `resource` with the project root hidden in its url.

    The input mapping is not modified. Only resources whose 'service' is
    'file' are rewritten: their 'url' becomes '<project_rootdir>/' followed by
    the url made relative to `dfc.rootdir()`. Any other service is returned
    as an unchanged copy.
    """
    masked = resource.copy()
    if masked['service'] != 'file':
        return masked

    relative_url = dfc.utils.relpath(masked['url'], dfc.rootdir())
    masked['url'] = '<project_rootdir>/' + relative_url
    return masked

### Slowly Changing Dimensions


In [36]:
# Build a 3-row sample frame and add generated `name`, `amount` and `purchase_date` columns.
df = dfc.range(3)
df = df.cols.create('name').fake('first_name')
df = df.cols.create('amount').randint(10)
df = df.cols.create('purchase_date').fake('date_this_year')
# NOTE(review): presumably cached so the generated values are not re-drawn on each action — confirm.
df.cache()

# Show the frame as a grid.
df.data.grid()

Unnamed: 0,id,name,amount,purchase_date
0,0,Jeffrey,9,2020-01-16
1,1,Helen,9,2020-01-16
2,2,Vanessa,6,2020-01-29


In [37]:
# basic save with overwrite
# Writes `df` to data/saved/dim with the 'scd:parquet' format, then lists what is stored at that path.
df.save('data/saved/dim', format='scd:parquet', mode='overwrite')
dfc.list('data/saved/dim').data.grid()

 [datafaucet] INFO scd.ipynb:engine:save_log | save
 [datafaucet] INFO scd.ipynb:engine:save_log | save


Unnamed: 0,name,type
0,.part-00000-2827b56d-8110-455c-88e8-b4b6406d7d...,FILE
1,part-00000-2827b56d-8110-455c-88e8-b4b6406d7d7...,FILE
2,_SUCCESS,FILE
3,._SUCCESS.crc,FILE


In [38]:
# save with partition by 'purchase_date'
# Same overwrite as before, but partitioned: the listing below shows one
# `purchase_date=<value>` directory per distinct date.
df.save('data/saved/dim', format='scd:parquet', mode='overwrite', partitionBy='purchase_date')
dfc.list('data/saved/dim').data.grid()

 [datafaucet] INFO scd.ipynb:engine:save_log | save
 [datafaucet] INFO scd.ipynb:engine:save_log | save


Unnamed: 0,name,type
0,purchase_date=2020-01-29,DIRECTORY
1,purchase_date=2020-01-16,DIRECTORY
2,_SUCCESS,FILE
3,._SUCCESS.crc,FILE


In [39]:
# Read back through the 'scd:parquet' format: only the original data columns are shown.
dfc.load('data/saved/dim', format='scd:parquet').data.grid()

 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Jeffrey,9,2020-01-16
1,2,Vanessa,6,2020-01-29
2,1,Helen,9,2020-01-16


In [40]:
# Read the same path as plain parquet: the output additionally exposes the
# `_state` and `_updated` bookkeeping columns.
dfc.load('data/saved/dim', format='parquet').data.grid()

 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,_state,_updated,purchase_date
0,2,Vanessa,6,0,2020-02-05 04:07:59,2020-01-29
1,0,Jeffrey,9,0,2020-02-05 04:07:59,2020-01-16
2,1,Helen,9,0,2020-02-05 04:07:59,2020-01-16


In [41]:
# SCD (slowly changing dimension) walkthrough: the following cells append, delete and
# update rows, saving after each step, and then read back earlier versions.

In [42]:
# Build a fresh 3-row sample frame to use as the initial version of the dimension.
df = dfc.range(3)
df = df.cols.create('name').fake('first_name')
df = df.cols.create('amount').randint(10)
df = df.cols.create('purchase_date').fake('date_this_year')
# NOTE(review): presumably cached so the generated values are not re-drawn on each action — confirm.
df.cache()

# NOTE(review): the argument 10 is presumably a row limit for the grid — confirm.
df.data.grid(10)

Unnamed: 0,id,name,amount,purchase_date
0,0,Katrina,5,2020-01-08
1,1,Nicole,3,2020-01-14
2,2,Brett,9,2020-01-13


In [43]:
# Overwrite the stored dimension with the initial data and keep the reloaded
# frame as `df_v0`; later cells compare it against `version=0`.
df.save('data/saved/dim', format='scd:parquet', mode='overwrite')
df_v0 = dfc.load('data/saved/dim', format='scd:parquet')
df_v0.data.grid()

 [datafaucet] INFO scd.ipynb:engine:save_log | save
 [datafaucet] INFO scd.ipynb:engine:save_log | save
 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Katrina,5,2020-01-08
1,1,Nicole,3,2020-01-14
2,2,Brett,9,2020-01-13


In [44]:
# Inspect the stored rows as plain parquet, including the `_state` and `_updated` columns.
dfc.load('data/saved/dim', format='parquet').data.grid()

 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date,_state,_updated
0,0,Katrina,5,2020-01-08,0,2020-02-05 04:08:03
1,1,Nicole,3,2020-01-14,0,2020-02-05 04:08:03
2,2,Brett,9,2020-01-13,0,2020-02-05 04:08:03


In [45]:
# append one
# Adds a single new row (id=3) to the in-memory frame; nothing is saved yet.
from datetime import datetime
df = df.rows.append([{'id':3, 'name':'Steve', 'amount':8, 'purchase_date': datetime(2019,4,28)}])
df.data.grid()

Unnamed: 0,id,name,amount,purchase_date
0,0,Katrina,5,2020-01-08
1,1,Nicole,3,2020-01-14
2,2,Brett,9,2020-01-13
3,3,Steve,8,2019-04-28


In [46]:
# Save with an SCD merge keyed on `id` (the log below reports added=1), then
# keep the reloaded state as `df_v1`.
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
df_v1 = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')
df_v1.data.grid()

 [datafaucet] INFO scd.ipynb:engine:load_log | load
 [datafaucet] NOTICE scd.ipynb:engine:save_scd | merge on=id, updated=0, added=1, deleted=0
 [datafaucet] INFO scd.ipynb:engine:save_log | save
 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Katrina,5,2020-01-08
1,1,Nicole,3,2020-01-14
2,3,Steve,8,2019-04-28
3,2,Brett,9,2020-01-13


In [47]:
# Plain parquet view: the appended row carries a later `_updated` timestamp than the initial rows.
dfc.load('data/saved/dim', format='parquet').data.grid()

 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date,_state,_updated
0,0,Katrina,5,2020-01-08,0,2020-02-05 04:08:03
1,1,Nicole,3,2020-01-14,0,2020-02-05 04:08:03
2,2,Brett,9,2020-01-13,0,2020-02-05 04:08:03
3,3,Steve,8,2019-04-28,0,2020-02-05 04:08:11


In [48]:
# delete one
# Removes the row with id 2 from the in-memory frame; nothing is saved yet.
df = df.rows.delete('id = 2')
df.data.grid()

Unnamed: 0,id,name,amount,purchase_date
0,0,Katrina,5,2020-01-08
1,1,Nicole,3,2020-01-14
2,3,Steve,8,2019-04-28


In [49]:
# Save with an SCD merge keyed on `id` (the log below reports deleted=1), then
# keep the reloaded state as `df_v2`.
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
df_v2 = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')
df_v2.data.grid()

 [datafaucet] INFO scd.ipynb:engine:load_log | load
 [datafaucet] NOTICE scd.ipynb:engine:save_scd | merge on=id, updated=0, added=0, deleted=1
 [datafaucet] INFO scd.ipynb:engine:save_log | save
 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Katrina,5,2020-01-08
1,1,Nicole,3,2020-01-14
2,3,Steve,8,2019-04-28


In [50]:
# Capture a timestamp between the delete save and the upcoming update save;
# a later cell passes it as `version=` to load the state as of this moment.
import datetime as dt
version_after_delete = dt.datetime.now()

In [51]:
# modify two
# Changes `name` for id 0 and `amount` for id 1, matching rows on `id`; nothing is saved yet.
df = df.rows.update([{'id':0, 'name':'Mel'}, {'id':1, 'amount':88}], on='id')
df.data.grid()

Unnamed: 0,id,name,amount,purchase_date
0,0,Mel,5,2020-01-08
1,1,Nicole,88,2020-01-14
2,3,Steve,8,2019-04-28


In [52]:
# Save with an SCD merge keyed on `id` (the log below reports updated=2), then
# keep the reloaded state as `df_v3`.
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
df_v3 = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')
df_v3.data.grid()

 [datafaucet] INFO scd.ipynb:engine:load_log | load
 [datafaucet] NOTICE scd.ipynb:engine:save_scd | merge on=id, updated=2, added=0, deleted=0
 [datafaucet] INFO scd.ipynb:engine:save_log | save
 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Mel,5,2020-01-08
1,1,Nicole,88,2020-01-14
2,3,Steve,8,2019-04-28


In [53]:
#redo, nothing happens
# Saving the unchanged frame again: the merge log below reports no updates,
# additions or deletions.
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
df_v4 = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')

# The reloaded state must match the state from before the redundant save.
assert equal(df_v4, df_v3)

 [datafaucet] INFO scd.ipynb:engine:load_log | load
 [datafaucet] NOTICE scd.ipynb:engine:save_scd | merge on=id, updated=0, added=0, deleted=0
 [datafaucet] INFO scd.ipynb:engine:save_log | save
 [datafaucet] INFO scd.ipynb:engine:load_log | load


In [54]:
# all history changes (can be retrieved directly with the underlying format)
# NOTE(review): in the output below, rows with `_state` = 1 appear for the deleted
# row and for the pre-update values — presumably 1 marks a removed/superseded row; confirm.

df = dfc.load('data/saved/dim', format='parquet')
df.data.grid()

 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date,_state,_updated
0,1,Nicole,88,2020-01-14,0,2020-02-05 04:08:34
1,0,Mel,5,2020-01-08,0,2020-02-05 04:08:34
2,1,Nicole,3,2020-01-14,1,2020-02-05 04:08:34
3,0,Katrina,5,2020-01-08,1,2020-02-05 04:08:34
4,0,Katrina,5,2020-01-08,0,2020-02-05 04:08:03
5,1,Nicole,3,2020-01-14,0,2020-02-05 04:08:03
6,2,Brett,9,2020-01-13,0,2020-02-05 04:08:03
7,3,Steve,8,2019-04-28,0,2020-02-05 04:08:11
8,2,Brett,9,2020-01-13,1,2020-02-05 04:08:21


In [55]:
# first version
# `version=0` must reproduce the state captured right after the initial overwrite (`df_v0`).
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id', version=0)
assert equal(df, df_v0)

df.data.grid()

 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Katrina,5,2020-01-08
1,1,Nicole,3,2020-01-14
2,2,Brett,9,2020-01-13


In [56]:
# last version (default)
# `version=-1` must reproduce the state after the update save (`df_v3`).
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id', version=-1)
assert equal(df, df_v3)

df.data.grid()

 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Mel,5,2020-01-08
1,1,Nicole,88,2020-01-14
2,3,Steve,8,2019-04-28


In [57]:
# second last version
# `version=-2` must reproduce the state after the delete save (`df_v2`).
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id', version=-2)
assert equal(df, df_v2)

df.data.grid()

 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Katrina,5,2020-01-08
1,1,Nicole,3,2020-01-14
2,3,Steve,8,2019-04-28


In [58]:
# by date, datetime ( version after delete is version=2)
# `version` also accepts a datetime: `version_after_delete` was captured after the
# delete save and before the update save, so the result must equal `df_v2`.
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id', version=version_after_delete)
assert equal(df, df_v2)

df.data.grid()

 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Katrina,5,2020-01-08
1,1,Nicole,3,2020-01-14
2,3,Steve,8,2019-04-28


In [59]:
# analyze scd history, deleted records, added records, modified records by field
# Works on the plain parquet view (with the bookkeeping columns). The output below has
# one row per `updated` timestamp with upd/add/del counts and per-column change counts.
df = dfc.load('data/saved/dim', format='parquet')
df.data.scd_analyze(merge_on='id')

 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,updated,upd,add,del,changes
0,2020-02-05 04:08:03,0,3,0,"{'name': 0, 'purchase_date': 0, 'amount': 0}"
1,2020-02-05 04:08:11,0,1,0,"{'name': 0, 'purchase_date': 0, 'amount': 0}"
2,2020-02-05 04:08:21,0,0,1,"{'name': 0, 'purchase_date': 0, 'amount': 0}"
3,2020-02-05 04:08:34,2,0,0,"{'name': 1, 'purchase_date': 0, 'amount': 1}"


### Schema changes

In [60]:
#schema change: add a column
# Load the current state and add a new generated integer column `v`; nothing is saved yet.
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')
df = df.cols.create('v').randint(0,10)
df.data.grid()

 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date,v
0,0,Mel,5,2020-01-08,9
1,1,Nicole,88,2020-01-14,9
2,3,Steve,8,2019-04-28,9


In [61]:
# Save the widened frame with an SCD merge keyed on `id`; the log below reports
# updated=3, and the reloaded data includes the new `v` column.
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
dfc.load('data/saved/dim', format='scd:parquet').data.grid()

 [datafaucet] INFO scd.ipynb:engine:load_log | load
 [datafaucet] NOTICE scd.ipynb:engine:save_scd | merge on=id, updated=3, added=0, deleted=0
 [datafaucet] INFO scd.ipynb:engine:save_log | save
 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date,v
0,0,Mel,5,2020-01-08,9
1,1,Nicole,88,2020-01-14,9
2,3,Steve,8,2019-04-28,9


In [62]:
#schema change: drop a column
# Drop the `v` column and show the in-memory frame. `display` is required here:
# a notebook only renders a cell's last expression automatically, so a bare
# mid-cell `df.data.grid()` would be evaluated and silently discarded.
df = df.cols.drop('v')
display(df.data.grid())

# Save the narrower frame with an SCD merge keyed on `id`, then show the stored state.
# NOTE(review): in the recorded run the merge log reported no changes and the
# reloaded data still contained `v` — confirm whether a dropped column is
# expected to persist in the stored dimension.
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
dfc.load('data/saved/dim', format='scd:parquet').data.grid()

 [datafaucet] INFO scd.ipynb:engine:load_log | load
 [datafaucet] NOTICE scd.ipynb:engine:save_scd | merge on=id, updated=0, added=0, deleted=0
 [datafaucet] INFO scd.ipynb:engine:save_log | save
 [datafaucet] INFO scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date,v
0,0,Mel,5,2020-01-08,9
1,1,Nicole,88,2020-01-14,9
2,3,Steve,8,2019-04-28,9
