# Datafaucet

Datafaucet is a productivity framework for ETL and ML applications. It simplifies common data-pipeline activities such as project scaffolding, data ingestion, star schema generation, and forecasting.

In [1]:
import datafaucet as dfc
dfc.__version__

'0.9.1'

## Loading and Saving Data

In [2]:
# Load the datafaucet project. Per the recorded log output, this is also the step
# that connects to the Spark master (local[*]) and starts the engine context.
dfc.project.load()

NOTICE dfc engine.py:__init__ Connecting to spark master: local[*]
NOTICE dfc engine.py:__init__ Engine context spark:2.4.4 successfully started


<datafaucet.project.Project at 0x7fb71c7df898>

In [3]:
def equal(a, b):
    """Tell whether two frames contain the same rows.

    Counts the rows left by `a.exceptAll(b)` and by `b.exceptAll(a)`;
    the frames are considered equal when both differences are empty.

    Returns:
        True when the two counts add up to zero, False otherwise.
    """
    only_in_a = a.exceptAll(b).count()
    only_in_b = b.exceptAll(a).count()
    return (only_in_a + only_in_b) == 0

def mask_rootdir(resource):
    """Return a copy of `resource` with the project root hidden from its url.

    The input is copied with `resource.copy()` and never modified. Only
    resources whose 'service' is 'file' are rewritten: their 'url' becomes
    '<project_rootdir>/' followed by the result of
    `dfc.utils.relpath(url, dfc.rootdir())`. Any other service is returned
    as an unchanged copy.
    """
    masked = resource.copy()
    if masked['service'] != 'file':
        return masked

    relative_url = dfc.utils.relpath(masked['url'], dfc.rootdir())
    masked['url'] = '<project_rootdir>/' + relative_url
    return masked

### Slowly Changing Dimensions


In [4]:
# Build a 3-row sample frame: ids from dfc.range(3) plus three generated columns.
# `name` and `purchase_date` come from the fake-data generator and `amount` from
# randint, so the values change from run to run (the recorded outputs of the two
# sample cells in this notebook differ).
df = dfc.range(3)
df = df.cols.create('name').fake('first_name')
df = df.cols.create('amount').randint(10)
df = df.cols.create('purchase_date').fake('date_this_year')
# NOTE(review): presumably cached so the randomly generated values stay stable
# across the later save/display actions — confirm.
df.cache()

# Display the generated rows.
df.data.grid(10)

Unnamed: 0,id,name,amount,purchase_date
0,0,Olivia,4,2019-09-26
1,1,Michael,8,2019-04-19
2,2,Jeffrey,9,2019-07-14


In [5]:
# basic save with overwrite: write the frame to data/saved/dim using the
# 'scd:parquet' format
df.save('data/saved/dim', format='scd:parquet', mode='overwrite')
# List what was written under the target directory (part file plus Spark
# marker/checksum files in the recorded output).
dfc.list('data/saved/dim').data.grid()

Unnamed: 0,name,type
0,part-00000-7938f918-2cfd-4006-941b-b77a437de1a...,FILE
1,_SUCCESS,FILE
2,._SUCCESS.crc,FILE
3,.part-00000-7938f918-2cfd-4006-941b-b77a437de1...,FILE


In [6]:
# save with partition by 'purchase_date'
df.save('data/saved/dim', format='scd:parquet', mode='overwrite', partitionBy='purchase_date')
# The listing now shows one `purchase_date=<value>` directory per distinct date.
dfc.list('data/saved/dim').data.grid()

Unnamed: 0,name,type
0,purchase_date=2019-04-19,DIRECTORY
1,purchase_date=2019-09-26,DIRECTORY
2,_SUCCESS,FILE
3,._SUCCESS.crc,FILE
4,purchase_date=2019-07-14,DIRECTORY


In [7]:
# Read back through the 'scd:parquet' format: the recorded output contains only
# the data columns (id, name, amount, purchase_date).
dfc.load('data/saved/dim', format='scd:parquet').data.grid()

Unnamed: 0,id,name,amount,purchase_date
0,1,Michael,8,2019-04-19
1,2,Jeffrey,9,2019-07-14
2,0,Olivia,4,2019-09-26


In [8]:
# Read the same path as plain parquet: the recorded output additionally shows
# the `_state` and `_updated` bookkeeping columns stored next to the data.
dfc.load('data/saved/dim', format='parquet').data.grid()

Unnamed: 0,id,name,amount,_state,_updated,purchase_date
0,1,Michael,8,0,2019-12-03 02:40:34,2019-04-19
1,2,Jeffrey,9,0,2019-12-03 02:40:34,2019-07-14
2,0,Olivia,4,0,2019-12-03 02:40:34,2019-09-26


In [9]:
# Generate a fresh 3-row sample with the same recipe as the first sample cell.
# This frame is the starting point for the append/delete/update walkthrough
# that follows.
df = dfc.range(3)
df = df.cols.create('name').fake('first_name')
df = df.cols.create('amount').randint(10)
df = df.cols.create('purchase_date').fake('date_this_year')
# NOTE(review): presumably cached so the randomly generated values stay stable
# across the later save/display actions — confirm.
df.cache()

# Display the generated rows.
df.data.grid(10)

Unnamed: 0,id,name,amount,purchase_date
0,0,Matthew,6,2019-03-04
1,1,Brian,7,2019-09-22
2,2,Brenda,8,2019-02-09


In [10]:
# todo: overwrite semantic as in delta (soft overwrite)
# Replace the saved dimension with the new sample (this time without partitionBy).
df.save('data/saved/dim', format='scd:parquet', mode='overwrite')
# Keep the reloaded state as df_v0; later cells compare `version=0` against it.
df_v0 = dfc.load('data/saved/dim', format='scd:parquet')
df_v0.data.grid()

Unnamed: 0,id,name,amount,purchase_date
0,0,Matthew,6,2019-03-04
1,1,Brian,7,2019-09-22
2,2,Brenda,8,2019-02-09


In [11]:
# Raw parquet view right after the overwrite: in the recorded output every row
# has _state=0 and the same _updated timestamp.
dfc.load('data/saved/dim', format='parquet').data.grid()

Unnamed: 0,id,name,amount,purchase_date,_state,_updated
0,0,Matthew,6,2019-03-04,0,2019-12-03 02:40:54
1,1,Brian,7,2019-09-22,0,2019-12-03 02:40:54
2,2,Brenda,8,2019-02-09,0,2019-12-03 02:40:54


In [12]:
# append one: add a row with id=3 to the in-memory frame.
# Nothing is written to data/saved/dim until the save in the next cell.
from datetime import datetime
df = df.rows.append([{'id':3, 'name':'Steve', 'amount':8, 'purchase_date': datetime(2019,4,28)}])
df.data.grid()

Unnamed: 0,id,name,amount,purchase_date
0,0,Matthew,6,2019-03-04
1,1,Brian,7,2019-09-22
2,2,Brenda,8,2019-02-09
3,3,Steve,8,2019-04-28


In [13]:
# Save in append mode, merging with the stored data on `id`. The log line
# emitted by save_scd reports the detected changes (here: added=1).
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
# Keep the reloaded state as df_v1 (state after the append).
df_v1 = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')
df_v1.data.grid()

NOTICE dfc engine.py:save_scd merge on=id, updated=0, added=1, deleted=0


Unnamed: 0,id,name,amount,purchase_date
0,0,Matthew,6,2019-03-04
1,1,Brian,7,2019-09-22
2,2,Brenda,8,2019-02-09
3,3,Steve,8,2019-04-28


In [14]:
# delete one: remove the row with id 2 from the in-memory frame
# (the stored data is only affected by the save in the next cell).
df = df.rows.delete('id = 2')
df.data.grid()

Unnamed: 0,id,name,amount,purchase_date
0,0,Matthew,6,2019-03-04
1,1,Brian,7,2019-09-22
2,3,Steve,8,2019-04-28


In [15]:
# Saving a frame that no longer contains id=2 is recorded as a delete
# (log: deleted=1).
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
# Keep the reloaded state as df_v2 (state after the delete); used by the
# version checks further down.
df_v2 = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')
df_v2.data.grid()

NOTICE dfc engine.py:save_scd merge on=id, updated=0, added=0, deleted=1


Unnamed: 0,id,name,amount,purchase_date
0,0,Matthew,6,2019-03-04
1,1,Brian,7,2019-09-22
2,3,Steve,8,2019-04-28


In [16]:
# Record the wall-clock time once the delete has been saved and before any
# update is applied; a later cell passes this datetime as the `version`
# argument of dfc.load to retrieve the post-delete state.
import datetime as dt
version_after_delete = dt.datetime.now()

In [20]:
# modify two: partial updates matched on `id`. Only the listed fields change
# (name for id 0, amount for id 1); the other columns keep their values, as the
# recorded output shows.
df = df.rows.update([{'id':0, 'name':'Mel'}, {'id':1, 'amount':88}], on='id')
df.data.grid()

Unnamed: 0,id,name,amount,purchase_date
0,0,Mel,6,2019-03-04
1,1,Brian,88,2019-09-22
2,3,Steve,8,2019-04-28


In [21]:
# Persist the two modified rows and keep the reloaded state as df_v3.
# NOTE(review): the recorded log reports updated=1 although two rows were
# modified, and the execution counts around this cell are not consecutive, so
# this output likely comes from a partial re-run — confirm with Restart & Run All.
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
df_v3 = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')
df_v3.data.grid()

NOTICE dfc engine.py:save_scd merge on=id, updated=1, added=0, deleted=0


Unnamed: 0,id,name,amount,purchase_date
0,1,Brian,88,2019-09-22
1,0,Mel,6,2019-03-04
2,3,Steve,8,2019-04-28


In [23]:
# redo, nothing happens: saving the unchanged frame again is expected to be a
# no-op (the log reports updated=0, added=0, deleted=0).
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
df_v4 = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')

# The state loaded after the redundant save must match the previous one.
assert equal(df_v4, df_v3)

NOTICE dfc engine.py:save_scd merge on=id, updated=0, added=0, deleted=0


In [24]:
# all history changes (can be retrieved directly with the underlying format)
# Each stored row carries `_state` and `_updated` (the save timestamp).
# NOTE(review): in the recorded output _state=1 rows appear to mark the
# superseded or deleted side of a change — confirm against the datafaucet docs.

df = dfc.load('data/saved/dim', format='parquet')
df.data.grid()

Unnamed: 0,id,name,amount,purchase_date,_state,_updated
0,1,Brian,88,2019-09-22,0,2019-12-03 02:42:44
1,1,Brian,7,2019-09-22,1,2019-12-03 02:42:44
2,0,Mel,6,2019-03-04,0,2019-12-03 02:41:18
3,0,Matthew,6,2019-03-04,1,2019-12-03 02:41:18
4,0,Matthew,6,2019-03-04,0,2019-12-03 02:40:54
5,1,Brian,7,2019-09-22,0,2019-12-03 02:40:54
6,2,Brenda,8,2019-02-09,0,2019-12-03 02:40:54
7,2,Brenda,8,2019-02-09,1,2019-12-03 02:41:09
8,3,Steve,8,2019-04-28,0,2019-12-03 02:41:02


In [25]:
# first version: version=0 selects the oldest snapshot, which must match the
# state captured as df_v0 right after the overwrite.
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id', version=0)
assert equal(df, df_v0)

df.data.grid()

Unnamed: 0,id,name,amount,purchase_date
0,0,Matthew,6,2019-03-04
1,1,Brian,7,2019-09-22
2,2,Brenda,8,2019-02-09


In [26]:
# last version (default): version=-1 selects the most recent snapshot, which
# must match df_v3 (the state after the updates).
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id', version=-1)
assert equal(df, df_v3)

df.data.grid()

Unnamed: 0,id,name,amount,purchase_date
0,1,Brian,88,2019-09-22
1,0,Mel,6,2019-03-04
2,3,Steve,8,2019-04-28


In [27]:
# second-to-last version: version=-2 selects the snapshot just before the latest
# one, expected to be the post-delete state df_v2.
# NOTE(review): the recorded output of this cell is an AssertionError. The raw
# history shown earlier contains two distinct `_updated` timestamps for the
# "modify two" step, so in that run version=-2 probably points at an
# intermediate state rather than df_v2 — confirm with Restart & Run All.
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id', version=-2)
assert equal(df, df_v2)

df.data.grid()

AssertionError: 

In [28]:
# by date, datetime (the version right after the delete is version=2)
# `version` is given a datetime here: `version_after_delete` was captured after
# the delete was saved and before the updates, so the result must match df_v2.
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id', version=version_after_delete)
assert equal(df, df_v2)

df.data.grid()

Unnamed: 0,id,name,amount,purchase_date
0,0,Matthew,6,2019-03-04
1,1,Brian,7,2019-09-22
2,3,Steve,8,2019-04-28


In [24]:
# save last version:
# take last snapshot
# add: append
# delete, update: overwrite partition

# ----
# partitionBy: _updated, _date, ?
# where: _updated > "2019-12-02 09:17:23"
# where: _updated > "2019-12-02"
# where: 3 (version)
# where: "2019-12-02 09:17:23"
# where: "2019-12-02"

In [3]:
# analyze scd history, deleted records, added records, modified records by field
# The raw parquet history (including the bookkeeping columns) is loaded and
# summarised per save, keyed on `id`.
# NOTE(review): the recorded output looks stale — its timestamps are from a
# different day than the history shown earlier and it reports a 'value' field
# instead of 'amount'. Re-run this cell after the cells above.
df = dfc.load('data/saved/dim', format='parquet')
df.data.scd_analyze(merge_on='id')

Unnamed: 0,updated,upd,add,del,changes
0,2019-12-02 09:17:16,0,3,0,"{'name': 0, 'value': 0}"
1,2019-12-02 09:17:23,0,1,0,"{'name': 0, 'value': 0}"
2,2019-12-02 09:17:30,0,0,1,"{'name': 0, 'value': 0}"
3,2019-12-02 09:17:37,2,0,0,"{'name': 1, 'value': 1}"
