In [2]:
import sys
import os
sys.path.append('..')
import dfx.datasets as datasets
import dfx.grain as dfx

# Grain
The Grain functionality helps to identify what each row in the dataset describes in real life.

For example:
 - an employee
 - a status change for an employee
 - in a reporting table ('cube'), a combination of dimensions
 
Grain relates to the fields that uniquely identify a row in a dataset. The functions below automate finding that unique combination, or help identify duplicates.

# Describe dataset with single unique column

In [3]:
# Wrap the employees sample dataset in a GrainDf and print its grain summary.
# With default arguments the summary reports a single column, employee_id,
# as 100% unique.
df = datasets.employees
g = dfx.GrainDf(df)
g.pprint()

Unique 100%, employee_id


In [4]:
# Boolean uniqueness flag for the detected grain (True for this dataset).
g.unique

True

In [5]:
# Uniqueness as a fraction; 1.0 here corresponds to the "Unique 100%" line
# printed by pprint().
g.unique_rate

1.0

In [6]:
# Column names that make up the detected grain, returned as a tuple.
g.columns

('employee_id',)

# Dataset with single column, duplicate values

In [7]:
# Same analysis on a sample dataset whose employee_id values repeat.
# NOTE(review): uniq_threshold=.6 presumably sets the minimum uniqueness rate
# at which a column is still reported as the grain (employee_id is only 62%
# unique here) -- confirm against the GrainDf implementation.
df = datasets.employee_dups.copy()
g = dfx.GrainDf(df, uniq_threshold=.6)
g.pprint()

Unique 62%, employee_id


In [8]:
# One row per grain value that occurs more than once, with the number of
# rows it occupies in the row_count column.
g.duplicate_ids()

Unnamed: 0,row_count,employee_id
1,3,12345
2,2,24543


In [9]:
# Full dataset rows for every duplicated grain value, each annotated with
# that value's row_count.
g.duplicate_rows()

Unnamed: 0,row_count,employee_id,region,state,salary,company,manager_id,status
0,3,12345,east,NY,100,Acme,36363,active
1,3,12345,east,NY,100,Acme,36363,active
2,3,12345,east,NY,100,Acme,36363,active
3,2,24543,east,NY,110,Acme,36363,active
4,2,24543,east,NY,110,Acme,36363,active


# Dataset with multi-column unique combination

In [10]:
# employee_hist needs two columns to be unique: the summary reports
# employee_id + status as the grain, with a percentage per column and the
# relationship between the two columns (many:many).
df = datasets.employee_hist.copy()
g = dfx.GrainDf(df)
g.pprint()

Unique 100%
   83% employee_id
   17% status
     employee_id          status               many:many


In [11]:
# Per-column contribution as a dict of fractions; these are the values
# pprint() shows as percentages next to each grain column.
g.contrib

{'employee_id': 0.8333333333333334, 'status': 0.16666666666666666}

In [12]:
# Pairwise relationships between grain columns, as a list of
# (column, column, relationship) tuples.
g.col_rels()

[('employee_id', 'status', 'many:many')]

# Dataset cube 

In [13]:
# Reporting-cube example: the grain is the combination of three columns
# (dept, status, date), and every pair of them is reported as many:many.
df = datasets.cube.copy()
g = dfx.GrainDf(df)
g.pprint()

Unique 100%
   33% dept
   33% status
   33% date
     dept                 status               many:many
     dept                 date                 many:many
     status               date                 many:many


In [16]:
# Regions example with default detection: the reported grain is
# state + town.
df = datasets.regions.copy()
g = dfx.GrainDf(df)
g.pprint()

Unique 100%
   42% state
   58% town
     state                town                 many:many


In [17]:
# Same regions data, but with the grain columns supplied explicitly so that
# county is included alongside state and town.
# NOTE(review): force=True presumably tells GrainDf to use the given columns
# as-is instead of searching for a smaller unique set -- confirm.
df = datasets.regions.copy()
g = dfx.GrainDf(df, columns=['state', 'county', 'town'], force=True)
g.pprint()

Unique 100%
   29% state
   29% county
   41% town
     state                county               many:many
     state                town                 many:many
     county               town                 many:many


# Summarize perfect cross & missing combinations

In [18]:
# multi-column
# Print the "perfect" flag and the missing rate, then display the grain-value
# combinations reported as missing (here: employee_id values with no 'old'
# status row).
g = dfx.GrainDf(datasets.employee_hist)
print(f"Is perfect: {g.perfect}")
print(f"Miss rate : {g.missing_rate:.0%}")
g.missing_rows

Is perfect: False
Miss rate : 40%


Unnamed: 0,employee_id,status
5,36363,old
7,48436,old
9,54664,old
11,69983,old
13,76576,old
15,87635,old
17,98765,old
19,0,old


In [19]:
# multi-column
# Same summary for the regions data: state/town pairs that do not occur in
# the dataset are listed as missing rows.
g = dfx.GrainDf(datasets.regions)
print(f"Is perfect: {g.perfect}")
print(f"Miss rate : {g.missing_rate:.0%}")
g.missing_rows

Is perfect: False
Miss rate : 72%


Unnamed: 0,state,town
4,ny,sandiego
5,ny,chulavist
6,ny,keywest
7,ny,marathon
8,ca,pittsford
9,ca,penfield
10,ca,belfast
11,ca,wellsville
14,ca,keywest
15,ca,marathon


In [20]:
# cube
# The cube is reported as perfect with a 0% miss rate, so missing_rows
# comes back with headers only.
g = dfx.GrainDf(datasets.cube)
print(f"Is perfect: {g.perfect}")
print(f"Miss rate : {g.missing_rate:.0%}")
g.missing_rows

Is perfect: True
Miss rate : 0%


Unnamed: 0,dept,status,date


### Filter up

In [25]:
# filter_up is called through the grain column attribute (g.status).
# For the value 'old' it returns all rows -- both 'active' and 'old' -- of
# the employees that have an 'old' row.
g = dfx.GrainDf(datasets.employee_hist)
g.status.filter_up('old')

Unnamed: 0,employee_id,region,state,salary,company,manager_id,status
0,12345,east,NY,100,Acme,36363,active
1,12345,east,NY,100,Acme,36363,old
2,24543,east,NY,110,Acme,36363,active
3,24543,east,NY,110,Acme,36363,old


In [26]:
# With state/county/town forced as the grain, filtering up from the town
# 'penfield' returns the rows that share its state and county (ny / monroe),
# so the sibling town pittsford is included.
g = dfx.GrainDf(datasets.regions, columns=['state', 'county', 'town'], force=True)
g.town.filter_up('penfield')

Unnamed: 0,state,county,town
0,ny,monroe,pittsford
1,ny,monroe,penfield


In [27]:
# Reuses the `g` built in the previous cell (regions, state/county/town).
# With also_above=['county'] the result widens to other counties in the same
# state (allegany rows appear as well).
# NOTE(review): exact semantics of also_above should be confirmed in dfx.grain.
g.town.filter_up('penfield', also_above=['county'])

Unnamed: 0,state,county,town
0,ny,monroe,pittsford
1,ny,monroe,penfield
2,ny,allegany,belfast
3,ny,allegany,wellsville


In [29]:
# on a perfect dataset, filter_up returns all rows
# (rows for hr and legal appear even though the filter value is 'finance');
# .head() limits the display to the first five rows.
g = dfx.GrainDf(datasets.cube)
g.dept.filter_up('finance').head()

Unnamed: 0,status,date,dept
0,active,01-01-2020,finance
1,active,01-01-2020,hr
2,active,01-01-2020,legal
3,active,02-01-2020,finance
4,active,02-01-2020,hr
