# Datafaucet

Datafaucet is a productivity framework for ETL and ML applications. It simplifies common data-pipeline activities such as project scaffolding, data ingestion, star schema generation, and forecasting.

In [1]:
# Import datafaucet under the `dfc` alias used by every later cell and show
# which version of the library produced the recorded outputs.
import datafaucet as dfc
dfc.__version__

'0.9.2'

## Loading and Saving Data

In [2]:
# Load the datafaucet project. In the recorded run this step logged the
# package configuration and the start of a local Spark engine.
dfc.project.load()

NOTICE:datafaucet:scd.ipynb:engine:set_submit_args | Configuring packages:
NOTICE:datafaucet:scd.ipynb:engine:set_submit_args |   -  mysql:mysql-connector-java:8.0.12
NOTICE:datafaucet:scd.ipynb:engine:__init__ | Connecting to spark master: local[*]
NOTICE:datafaucet:scd.ipynb:engine:__init__ | Engine context spark:2.4.4 successfully started


<datafaucet.project.Project at 0x7f6285dadf28>

In [3]:
def equal(a, b):
    """Return True when ``a`` and ``b`` hold the same rows, duplicates included.

    The check is symmetric: rows left over after removing ``b`` from ``a``
    and rows left over after removing ``a`` from ``b`` are both counted,
    and the two frames are equal only when nothing is left on either side.
    """
    only_in_a = a.exceptAll(b).count()
    only_in_b = b.exceptAll(a).count()
    return (only_in_a + only_in_b) == 0

def mask_rootdir(resource):
    """Return a copy of ``resource`` with the project root hidden from its url.

    The input mapping is never modified. Only resources whose ``'service'``
    is ``'file'`` are rewritten: their ``'url'`` is made relative to
    ``dfc.rootdir()`` and prefixed with the ``<project_rootdir>/``
    placeholder. Every other resource is returned as an unchanged copy.
    """
    masked = resource.copy()
    if masked['service'] != 'file':
        return masked

    relative_url = dfc.utils.relpath(masked['url'], dfc.rootdir())
    masked['url'] = '<project_rootdir>/' + relative_url
    return masked

### Slowly Changing Dimensions


In [4]:
# Build a three-row demo frame: an id from the range plus a fake first name,
# a random integer amount and a fake purchase date.
df = (
    dfc.range(3)
    .cols.create('name').fake('first_name')
    .cols.create('amount').randint(10)
    .cols.create('purchase_date').fake('date_this_year')
)
# Cache the frame (presumably so the generated values stay the same across
# the actions below — confirm against datafaucet/Spark caching semantics).
df.cache()

df.data.grid(10)

Unnamed: 0,id,name,amount,purchase_date
0,0,Paul,4,2019-11-11
1,1,Kim,4,2019-04-25
2,2,Kristy,9,2019-03-16


In [5]:
# Basic save with overwrite, using the 'scd:parquet' format, followed by a
# listing of what was written under the target path.
df.save('data/saved/dim', format='scd:parquet', mode='overwrite')
dfc.list('data/saved/dim').data.grid()

INFO:datafaucet:scd.ipynb:engine:save_log | save


Unnamed: 0,name,type
0,.part-00000-8939c880-f200-41d1-b470-3199665439...,FILE
1,part-00000-8939c880-f200-41d1-b470-31996654393...,FILE
2,_SUCCESS,FILE
3,._SUCCESS.crc,FILE


In [6]:
# Save again, this time partitioned by 'purchase_date'. The listing below
# then shows one directory per purchase_date value instead of part files.
df.save('data/saved/dim', format='scd:parquet', mode='overwrite', partitionBy='purchase_date')
dfc.list('data/saved/dim').data.grid()

INFO:datafaucet:scd.ipynb:engine:save_log | save


Unnamed: 0,name,type
0,purchase_date=2019-04-25,DIRECTORY
1,purchase_date=2019-03-16,DIRECTORY
2,purchase_date=2019-11-11,DIRECTORY
3,_SUCCESS,FILE
4,._SUCCESS.crc,FILE


In [7]:
# Read the data back through the 'scd:parquet' format: the recorded output
# contains only the original data columns.
dfc.load('data/saved/dim', format='scd:parquet').data.grid()

INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,2,Kristy,9,2019-03-16
1,0,Paul,4,2019-11-11
2,1,Kim,4,2019-04-25


In [8]:
# Read the same path as plain parquet: the recorded output additionally
# exposes the `_state` and `_updated` columns written by the SCD save.
dfc.load('data/saved/dim', format='parquet').data.grid()

INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,_state,_updated,purchase_date
0,2,Kristy,9,0,2019-12-13 08:08:32,2019-03-16
1,0,Paul,4,0,2019-12-13 08:08:32,2019-11-11
2,1,Kim,4,0,2019-12-13 08:08:32,2019-04-25


In [9]:
# SCD (slowly changing dimension) walkthrough: the cells below append,
# delete and update rows, saving after each step to build up a history.

In [10]:
# Generate a fresh three-row frame (id, fake first name, random integer
# amount, fake purchase date) as the starting point of the SCD history.
df = (
    dfc.range(3)
    .cols.create('name').fake('first_name')
    .cols.create('amount').randint(10)
    .cols.create('purchase_date').fake('date_this_year')
)
# Cache the frame (presumably to keep the generated values stable for the
# saves that follow — confirm against datafaucet/Spark caching semantics).
df.cache()

df.data.grid(10)

Unnamed: 0,id,name,amount,purchase_date
0,0,Blake,2,2019-05-03
1,1,Lindsey,3,2019-05-10
2,2,Steven,5,2019-06-23


In [11]:
# Start the history from scratch by overwriting the target, then keep the
# loaded result as `df_v0` so later cells can compare against it.
df.save('data/saved/dim', format='scd:parquet', mode='overwrite')
df_v0 = dfc.load('data/saved/dim', format='scd:parquet')
df_v0.data.grid()

INFO:datafaucet:scd.ipynb:engine:save_log | save
INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Blake,2,2019-05-03
1,1,Lindsey,3,2019-05-10
2,2,Steven,5,2019-06-23


In [12]:
# Plain-parquet view of the initial save: every row carries `_state` 0 and
# the same `_updated` timestamp in the recorded output.
dfc.load('data/saved/dim', format='parquet').data.grid()

INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date,_state,_updated
0,0,Blake,2,2019-05-03,0,2019-12-13 08:08:36
1,1,Lindsey,3,2019-05-10,0,2019-12-13 08:08:36
2,2,Steven,5,2019-06-23,0,2019-12-13 08:08:36


In [13]:
# Append one row (id=3) to the in-memory frame; nothing is written to
# storage until the save in the next cell.
from datetime import datetime
df = df.rows.append([{'id':3, 'name':'Steve', 'amount':8, 'purchase_date': datetime(2019,4,28)}])
df.data.grid()

Unnamed: 0,id,name,amount,purchase_date
0,0,Blake,2,2019-05-03
1,1,Lindsey,3,2019-05-10
2,2,Steven,5,2019-06-23
3,3,Steve,8,2019-04-28


In [14]:
# Save in append mode, merging on 'id' (the recorded log reports added=1),
# and keep the reloaded result as `df_v1`.
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
df_v1 = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')
df_v1.data.grid()

INFO:datafaucet:scd.ipynb:engine:load_log | load
NOTICE:datafaucet:scd.ipynb:engine:save_scd | merge on=id, updated=0, added=1, deleted=0
INFO:datafaucet:scd.ipynb:engine:save_log | save
INFO:datafaucet:scd.ipynb:engine:save_log | save
INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Blake,2,2019-05-03
1,1,Lindsey,3,2019-05-10
2,2,Steven,5,2019-06-23
3,3,Steve,8,2019-04-28


In [15]:
# Plain-parquet view after the append: the new row (id=3) has a later
# `_updated` timestamp than the three original rows.
dfc.load('data/saved/dim', format='parquet').data.grid()

INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date,_state,_updated
0,0,Blake,2,2019-05-03,0,2019-12-13 08:08:36
1,1,Lindsey,3,2019-05-10,0,2019-12-13 08:08:36
2,2,Steven,5,2019-06-23,0,2019-12-13 08:08:36
3,3,Steve,8,2019-04-28,0,2019-12-13 08:08:43


In [16]:
# Delete one row (id = 2) from the in-memory frame; the stored data is
# only affected by the save in the next cell.
df = df.rows.delete('id = 2')
df.data.grid()

Unnamed: 0,id,name,amount,purchase_date
0,0,Blake,2,2019-05-03
1,1,Lindsey,3,2019-05-10
2,3,Steve,8,2019-04-28


In [17]:
# Save the frame without id=2 (the recorded log reports deleted=1) and keep
# the reloaded result as `df_v2`.
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
df_v2 = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')
df_v2.data.grid()

INFO:datafaucet:scd.ipynb:engine:load_log | load
NOTICE:datafaucet:scd.ipynb:engine:save_scd | merge on=id, updated=0, added=0, deleted=1
INFO:datafaucet:scd.ipynb:engine:save_log | save
INFO:datafaucet:scd.ipynb:engine:save_log | save
INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Blake,2,2019-05-03
1,1,Lindsey,3,2019-05-10
2,3,Steve,8,2019-04-28


In [18]:
# Capture a timestamp after the delete has been saved and before the update
# below is saved; it is used later to load the data "as of" this moment.
# NOTE(review): `datetime` was already imported as a class in an earlier
# cell; this second import style (`dt`) is kept to avoid changing names.
import datetime as dt
version_after_delete = dt.datetime.now()

In [19]:
# Modify two rows, matched on 'id': rename id=0 and change the amount of
# id=1. Only the listed fields are changed in the displayed result.
df = df.rows.update([{'id':0, 'name':'Mel'}, {'id':1, 'amount':88}], on='id')
df.data.grid()

Unnamed: 0,id,name,amount,purchase_date
0,0,Mel,2,2019-05-03
1,1,Lindsey,88,2019-05-10
2,3,Steve,8,2019-04-28


In [20]:
# Save the two modifications (the recorded log reports updated=2) and keep
# the reloaded result as `df_v3`.
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
df_v3 = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')
df_v3.data.grid()

INFO:datafaucet:scd.ipynb:engine:load_log | load
NOTICE:datafaucet:scd.ipynb:engine:save_scd | merge on=id, updated=2, added=0, deleted=0
INFO:datafaucet:scd.ipynb:engine:save_log | save
INFO:datafaucet:scd.ipynb:engine:save_log | save
INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Mel,2,2019-05-03
1,1,Lindsey,88,2019-05-10
2,3,Steve,8,2019-04-28


In [21]:
# Save the unchanged frame again: the recorded log reports no updates,
# additions or deletions, and the reloaded data must equal `df_v3`.
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
df_v4 = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')

assert equal(df_v4, df_v3)

INFO:datafaucet:scd.ipynb:engine:load_log | load
NOTICE:datafaucet:scd.ipynb:engine:save_scd | merge on=id, updated=0, added=0, deleted=0
INFO:datafaucet:scd.ipynb:engine:save_log | save
INFO:datafaucet:scd.ipynb:engine:save_log | save
INFO:datafaucet:scd.ipynb:engine:load_log | load


In [22]:
# The full change history can be read directly with the underlying format
# (plain parquet). Note that `df` is rebound to this history frame here.
# NOTE(review): in the recorded output `_state` looks like 0 for rows as
# written and 1 for rows that were later deleted or superseded — inferred
# from the output only, confirm against the datafaucet documentation.

df = dfc.load('data/saved/dim', format='parquet')
df.data.grid()

INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date,_state,_updated
0,0,Mel,2,2019-05-03,0,2019-12-13 08:08:59
1,1,Lindsey,88,2019-05-10,0,2019-12-13 08:08:59
2,1,Lindsey,3,2019-05-10,1,2019-12-13 08:08:59
3,0,Blake,2,2019-05-03,1,2019-12-13 08:08:59
4,0,Blake,2,2019-05-03,0,2019-12-13 08:08:36
5,1,Lindsey,3,2019-05-10,0,2019-12-13 08:08:36
6,2,Steven,5,2019-06-23,0,2019-12-13 08:08:36
7,2,Steven,5,2019-06-23,1,2019-12-13 08:08:51
8,3,Steve,8,2019-04-28,0,2019-12-13 08:08:43


In [23]:
# First version: `version=0` must reproduce the data of the initial save,
# which was kept as `df_v0`.
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id', version=0)
assert equal(df, df_v0)

df.data.grid()

INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Blake,2,2019-05-03
1,1,Lindsey,3,2019-05-10
2,2,Steven,5,2019-06-23


In [24]:
# Last version (default): `version=-1` must match `df_v3`, the state after
# the two-row update (the later re-save changed nothing).
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id', version=-1)
assert equal(df, df_v3)

df.data.grid()

INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Mel,2,2019-05-03
1,1,Lindsey,88,2019-05-10
2,3,Steve,8,2019-04-28


In [25]:
# Second-to-last version: `version=-2` must match `df_v2`, the state after
# the delete and before the two-row update.
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id', version=-2)
assert equal(df, df_v2)

df.data.grid()

INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Blake,2,2019-05-03
1,1,Lindsey,3,2019-05-10
2,3,Steve,8,2019-04-28


In [26]:
# Load by date/datetime instead of by index. `version_after_delete` was
# captured after the delete was saved and before the update was saved, so
# the result must match `df_v2` (the version after the delete, version=2).
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id', version=version_after_delete)
assert equal(df, df_v2)

df.data.grid()

INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date
0,0,Blake,2,2019-05-03
1,1,Lindsey,3,2019-05-10
2,3,Steve,8,2019-04-28


In [27]:
# Analyze the SCD history from the raw parquet data. The recorded output
# has one row per save timestamp with the number of updated (upd), added
# (add) and deleted (del) records, plus a per-field count of changes.
df = dfc.load('data/saved/dim', format='parquet')
df.data.scd_analyze(merge_on='id')

INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,updated,upd,add,del,changes
0,2019-12-13 08:08:36,0,3,0,"{'name': 0, 'purchase_date': 0, 'amount': 0}"
1,2019-12-13 08:08:43,0,1,0,"{'name': 0, 'purchase_date': 0, 'amount': 0}"
2,2019-12-13 08:08:51,0,0,1,"{'name': 0, 'purchase_date': 0, 'amount': 0}"
3,2019-12-13 08:08:59,2,0,0,"{'name': 1, 'purchase_date': 0, 'amount': 1}"


### Schema changes

In [31]:
# Schema change: add a column. Start from the latest stored version and add
# a new random integer column 'v' to the in-memory frame.
df = dfc.load('data/saved/dim', format='scd:parquet', merge_on='id')
df = df.cols.create('v').randint(0,10)
df.data.grid()

INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date,v
0,0,Mel,2,2019-05-03,8
1,1,Lindsey,88,2019-05-10,9
2,3,Steve,8,2019-04-28,5


In [32]:
# Save the frame with the extra column. The recorded log reports updated=3,
# i.e. all three rows count as updated, and the reloaded data includes 'v'.
df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
dfc.load('data/saved/dim', format='scd:parquet').data.grid()

INFO:datafaucet:scd.ipynb:engine:load_log | load
NOTICE:datafaucet:scd.ipynb:engine:save_scd | merge on=id, updated=3, added=0, deleted=0
INFO:datafaucet:scd.ipynb:engine:save_log | save
INFO:datafaucet:scd.ipynb:engine:save_log | save
INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date,v
0,3,Steve,8,2019-04-28,5
1,1,Lindsey,88,2019-05-10,9
2,0,Mel,2,2019-05-03,8


In [33]:
# Schema change: drop a column.
df = df.cols.drop('v')
# A bare expression in the middle of a cell is evaluated but never rendered
# (only the last expression is), so the intermediate frame is shown
# explicitly. `display` is provided by the IPython kernel; no import needed.
display(df.data.grid())

df.save('data/saved/dim', format='scd:parquet', mode='append', merge_on='id')
# NOTE(review): in the recorded run the save log reports updated=0 and the
# reloaded data still contains column 'v', so dropping a column does not
# appear to be propagated by the merge — confirm whether this is intended.
dfc.load('data/saved/dim', format='scd:parquet').data.grid()

INFO:datafaucet:scd.ipynb:engine:load_log | load
NOTICE:datafaucet:scd.ipynb:engine:save_scd | merge on=id, updated=0, added=0, deleted=0
INFO:datafaucet:scd.ipynb:engine:save_log | save
INFO:datafaucet:scd.ipynb:engine:save_log | save
INFO:datafaucet:scd.ipynb:engine:load_log | load


Unnamed: 0,id,name,amount,purchase_date,v
0,3,Steve,8,2019-04-28,5
1,1,Lindsey,88,2019-05-10,9
2,0,Mel,2,2019-05-03,8
