# Datafaucet

Datafaucet is a productivity framework for ETL and ML applications. It simplifies common data-pipeline activities such as project scaffolding, data ingestion, star schema generation, and forecasting.

In [1]:
import datafaucet as dfc
# Show the installed datafaucet version (the outputs in this notebook were produced with 0.9.1).
dfc.__version__

'0.9.1'

## Loading and Saving Data

In [2]:
# Load the datafaucet project. Per the log output below, this also connects to
# a local Spark master and starts the engine context.
dfc.project.load()

NOTICE dfc engine.py:__init__ Connecting to spark master: local[*]
NOTICE dfc engine.py:__init__ Engine context spark:2.4.4 successfully started


<datafaucet.project.Project at 0x7f8d4dd59748>

In [3]:
def equal(a, b):
    """Return True when `a` and `b` hold the same rows, duplicates included.

    Both arguments must expose `exceptAll(other)` returning an object with
    `count()` (presumably Spark DataFrames, as used throughout this notebook).
    The frames are equal when neither side has rows left over after removing
    the other side's rows.
    """
    only_in_a = a.exceptAll(b).count()
    only_in_b = b.exceptAll(a).count()
    return (only_in_a + only_in_b) == 0

def mask_rootdir(resource):
    """Return a copy of `resource` with the project root hidden from its url.

    `resource` must support `.copy()` and have 'service' and 'url' keys.
    When the service is 'file', the url in the copy is rewritten as
    '<project_rootdir>/' followed by the url's path relative to
    `dfc.rootdir()`. For any other service the copy is returned as is.
    The input object itself is never modified.
    """
    masked = resource.copy()
    if masked['service'] != 'file':
        return masked

    relative_url = dfc.utils.relpath(masked['url'], dfc.rootdir())
    masked['url'] = '<project_rootdir>/' + relative_url
    return masked

### Slowly Changing Dimensions


In [4]:
# Build a 3-row sample dimension: an `id` range plus a fake first name and a
# random integer value per row.
df = (
    dfc.range(3)
    .cols.create('name').fake('first_name')
    .cols.create('value').randint(10)
)
# NOTE(review): presumably cached so the randomly generated columns are not
# re-drawn by each later action -- confirm.
df.cache()

df.data.grid(10)

Unnamed: 0,id,name,value
0,0,Amanda,9
1,1,Christine,9
2,2,Tiffany,8


In [5]:
# Save the frame as an scd table on parquet, using mode='overwrite'.
df.save('data/saved/dim', format='scd:parquet', mode='overwrite')

True

In [6]:
# Read it back through the scd reader: only the data columns are returned.
dfc.load('data/saved/dim', format='scd:parquet').data.grid()

Unnamed: 0,id,name,value
0,0,Amanda,9
1,1,Christine,9
2,2,Tiffany,8


In [7]:
# Read the same path as plain parquet to see what is physically stored:
# the scd save added the `_state` and `_updated` bookkeeping columns.
dfc.load('data/saved/dim', format='parquet').data.grid()

Unnamed: 0,id,name,value,_state,_updated
0,0,Amanda,9,0,2019-12-02 09:17:13
1,1,Christine,9,0,2019-12-02 09:17:13
2,2,Tiffany,8,0,2019-12-02 09:17:13


In [8]:
# Generate a fresh 3-row sample frame (new fake names and random values) to
# save over the first one.
df = (
    dfc.range(3)
    .cols.create('name').fake('first_name')
    .cols.create('value').randint(10)
)
# NOTE(review): presumably cached so the randomly generated columns are not
# re-drawn by each later action -- confirm.
df.cache()

df.data.grid(10)

Unnamed: 0,id,name,value
0,0,Renee,9
1,1,Rachel,8
2,2,David,5


In [9]:
# TODO: support a soft-overwrite semantic, as in Delta.
# For now mode='overwrite' replaces what was stored: the raw parquet view in
# the next cell shows only these three rows, not the earlier ones.
df.save('data/saved/dim', format='scd:parquet', mode='overwrite')
# df_v0 is the baseline snapshot used by the version checks further down.
df_v0 = dfc.load('data/saved/dim', format='scd:parquet')
df_v0.data.grid()

Unnamed: 0,id,name,value
0,0,Renee,9
1,1,Rachel,8
2,2,David,5


In [10]:
# Raw parquet view after the second overwrite: only the new rows are stored.
dfc.load('data/saved/dim', format='parquet').data.grid()

Unnamed: 0,id,name,value,_state,_updated
0,0,Renee,9,0,2019-12-02 09:17:16
1,1,Rachel,8,0,2019-12-02 09:17:16
2,2,David,5,0,2019-12-02 09:17:16


In [11]:
# Append one row (id=3) to the working frame.
df = df.rows.append([{'id':3, 'name':'Steve', 'value':8}])
df.data.grid()

Unnamed: 0,id,name,value
0,0,Renee,9
1,1,Rachel,8
2,2,David,5
3,3,Steve,8


In [12]:
# Merge the working frame into the stored dimension, matching rows on `id`.
# The log line below reports the outcome (here: one row added).
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
# df_v1: snapshot after the append.
df_v1 = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')
df_v1.data.grid()

NOTICE dfc engine.py:save_scd merge on=id, updated=0, added=1, deleted=0


Unnamed: 0,id,name,value
0,0,Renee,9
1,1,Rachel,8
2,2,David,5
3,3,Steve,8


In [13]:
# Delete one row (id = 2) from the working frame.
df = df.rows.delete('id = 2')
df.data.grid()

Unnamed: 0,id,name,value
0,0,Renee,9
1,1,Rachel,8
2,3,Steve,8


In [14]:
# Merge again on `id`: the row missing from the frame is recorded as a
# deletion (the log below reports deleted=1).
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
# df_v2: snapshot after the delete, checked against versioned loads later.
df_v2 = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')
df_v2.data.grid()

NOTICE dfc engine.py:save_scd merge on=id, updated=0, added=0, deleted=1


Unnamed: 0,id,name,value
0,0,Renee,9
1,1,Rachel,8
2,3,Steve,8


In [15]:
import datetime as dt
# Capture the current time between the delete save and the upcoming update
# save; it is used later as a datetime-based `version` selector.
# NOTE(review): this is a naive local datetime -- confirm it is comparable
# with the stored `_updated` timestamps.
version_after_delete = dt.datetime.now()

In [16]:
# Modify two rows, matched on `id`: rename id 0 and change the value of id 1.
df = df.rows.update([{'id':0, 'name':'Mel'}, {'id':1, 'value':88}], on='id')
df.data.grid()

Unnamed: 0,id,name,value
0,0,Mel,9
1,1,Rachel,88
2,3,Steve,8


In [17]:
# Merge the modified frame on `id`; the log below reports updated=2.
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
# df_v3: snapshot after the update, i.e. the latest version.
df_v3 = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')
df_v3.data.grid()

NOTICE dfc engine.py:save_scd merge on=id, updated=2, added=0, deleted=0


Unnamed: 0,id,name,value
0,0,Mel,9
1,1,Rachel,88
2,3,Steve,8


In [18]:
# Redo the same save: nothing changes (the log reports 0 updated/added/deleted).
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
df_v4 = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')

# The reloaded data must be identical to the previous snapshot.
assert equal(df_v4, df_v3)

NOTICE dfc engine.py:save_scd merge on=id, updated=0, added=0, deleted=0


In [19]:
# All changes: the raw parquet view lists every stored row version together
# with its `_state` flag and `_updated` timestamp.
# NOTE(review): in the output, `_state` = 1 appears on the deleted row and on
# the previous values of updated rows -- confirm the exact semantics.

df = dfc.load('data/saved/dim', format='parquet')
df.data.grid()

Unnamed: 0,id,name,value,_state,_updated
0,0,Mel,9,0,2019-12-02 09:17:37
1,1,Rachel,88,0,2019-12-02 09:17:37
2,1,Rachel,8,1,2019-12-02 09:17:37
3,0,Renee,9,1,2019-12-02 09:17:37
4,0,Renee,9,0,2019-12-02 09:17:16
5,1,Rachel,8,0,2019-12-02 09:17:16
6,2,David,5,0,2019-12-02 09:17:16
7,3,Steve,8,0,2019-12-02 09:17:23
8,2,David,5,1,2019-12-02 09:17:30


In [20]:
# First version: version=0 must reproduce the baseline snapshot df_v0.
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id', version=0)
assert equal(df, df_v0)

df.data.grid()

Unnamed: 0,id,name,value
0,0,Renee,9
1,1,Rachel,8
2,2,David,5


In [21]:
# Last version (the default): version=-1 must match df_v3, the snapshot taken
# after the update.
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id', version=-1)
assert equal(df, df_v3)

df.data.grid()

Unnamed: 0,id,name,value
0,0,Mel,9
1,1,Rachel,88
2,3,Steve,8


In [22]:
# Second-to-last version: version=-2 must match df_v2, the snapshot taken
# after the delete and before the update.
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id', version=-2)
assert equal(df, df_v2)

df.data.grid()

Unnamed: 0,id,name,value
0,0,Renee,9
1,1,Rachel,8
2,3,Steve,8


In [23]:
# Select a version by date/datetime: `version_after_delete` was captured right
# after the delete save (version=2), so this must match df_v2.
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id', version=version_after_delete)
assert equal(df, df_v2)

df.data.grid()

Unnamed: 0,id,name,value
0,0,Renee,9
1,1,Rachel,8
2,3,Steve,8


In [24]:
# TODO: add an example with `where`

In [3]:
# Analyze the scd history from the raw parquet data: for each save timestamp,
# report the number of updated, added and deleted records, plus how many
# changes touched each field.
df = dfc.load('data/saved/dim', format='parquet')
df.data.scd_analyze(merge_on='id')

Unnamed: 0,updated,upd,add,del,changes
0,2019-12-02 09:17:16,0,3,0,"{'name': 0, 'value': 0}"
1,2019-12-02 09:17:23,0,1,0,"{'name': 0, 'value': 0}"
2,2019-12-02 09:17:30,0,0,1,"{'name': 0, 'value': 0}"
3,2019-12-02 09:17:37,2,0,0,"{'name': 1, 'value': 1}"
