In [1]:
import json

from pandas.testing import assert_frame_equal

import data_wrangling_components.types as types
from data_wrangling_components.pipeline import Pipeline

## Aggregated Lookup



### Load files into the table store

In [2]:
# Pipeline that the join / filter / aggregate / lookup steps are added to.
aggregated_lookup = Pipeline()

# (table name, CSV path) pairs; the names are what the steps use to refer to
# these inputs.
input_tables = (
    ('companies', '../data/companies.csv'),
    ('products', '../data/products.csv'),
)
for table_name, csv_path in input_tables:
    aggregated_lookup.add_dataset(table_name, path=csv_path)

### Create steps

In [3]:
# Every step is built as Step(verb, <table name>, <table name>, args=...).
# The second and third positional arguments appear to be the input and output
# table names: each third argument is reused as a later step's input and shows
# up in the store listing after the run.

# 1. Join "companies" with "products" on the "ID" column -> "join-1".
join_step = types.Step(
    types.Verb.Join,
    "companies",
    "join-1",
    args={"other": "products", "on": ["ID"]},
)

# 2. Filter "join-1" by comparing "FY21 Sales" against the literal value 10000
#    with the Gte operator -> "filter-1".
#    NOTE(review): "to" is an empty string here; presumably unused by this
#    verb -- confirm against the Filter verb's arguments.
filter_args = {
    "to": "",
    "column": "FY21 Sales",
    "operator": types.NumericComparisonOperator.Gte,
    "type": types.FilterCompareType.Value,
    "value": 10000,
}
filter_step = types.Step(
    types.Verb.Filter,
    "join-1",
    "filter-1",
    args=filter_args,
)

# 3. Aggregate "filter-1": Sum of "FY21 Sales" grouped by "ID", written to the
#    column "Total sales >= 10k" -> "aggregate-1".
aggregate_args = {
    "to": "Total sales >= 10k",
    "groupby": "ID",
    "column": "FY21 Sales",
    "operation": types.FieldAggregateOperation.Sum,
}
aggregate_step = types.Step(
    types.Verb.Aggregate,
    "filter-1",
    "aggregate-1",
    args=aggregate_args,
)

# 4. Lookup: bring the "Total sales >= 10k" column from "aggregate-1" into
#    "companies", matching on "ID" -> "lookup-1".
lookup_args = {
    "other": "aggregate-1",
    "on": ["ID"],
    "columns": ["Total sales >= 10k"],
}
lookup_step = types.Step(
    types.Verb.Lookup,
    "companies",
    "lookup-1",
    args=lookup_args,
)

# Register the steps in the order they were defined above.
for step in (join_step, filter_step, aggregate_step, lookup_step):
    aggregated_lookup.add(step)

### Run pipeline

In [4]:
# Execute the steps added above; the cell output shows the resulting table
# (the companies columns plus "Total sales >= 10k").
aggregated_lookup.run()

Unnamed: 0,ID,Name,Employees,US,Total sales >= 10k
0,1,Microsoft,160000,True,3206000
1,2,Apple,150000,True,4180000
2,3,Google,135000,True,660000
3,4,Amazon,1250000,True,38000
4,5,Samsung,270000,False,630000


### Check Results

In [5]:
# List the names of all tables in the store (inputs and step outputs):
print(aggregated_lookup.list_store())

# Fetch the final result by the output name given to the last step:
lookup_final_result = aggregated_lookup.get_dataset('lookup-1')

## You can save the output to a csv file with:
# lookup_final_result.to_csv('name.csv')

## You can check other intermediate results by passing any name from the
## list above to get_dataset, for example:
# aggregated_lookup.get_dataset('filter-1')

# Preview table:
lookup_final_result.head()

['companies', 'products', 'join-1', 'filter-1', 'aggregate-1', 'lookup-1']


Unnamed: 0,ID,Name,Employees,US,Total sales >= 10k
0,1,Microsoft,160000,True,3206000
1,2,Apple,150000,True,4180000
2,3,Google,135000,True,660000
3,4,Amazon,1250000,True,38000
4,5,Samsung,270000,False,630000


## Reading from a JSON spec

In [6]:
# Load the binning pipeline specification (`json` is imported in the first
# cell). json.load parses directly from the file object, and the explicit
# encoding keeps the read independent of the platform's default locale encoding.
with open('binning.json', 'r', encoding='utf-8') as binning_spec:
    pipeline_spec = json.load(binning_spec)

In [7]:

# Build the pipeline from the "steps" entry of the loaded spec.
binning_steps = pipeline_spec['steps']
binning_pipeline = Pipeline.from_json(binning_steps)

# Register the input table in the store. The dataset name is the string
# 'data/products.csv'; presumably it has to match the input name used inside
# binning.json -- TODO confirm against the spec file.
binning_pipeline.add_dataset('data/products.csv', path='../data/products.csv')

In [8]:
# Run pipeline: executes the steps loaded from the JSON spec; the cell output
# shows the resulting table, including the "Binned" column.
binning_pipeline.run()


Unnamed: 0,ID,Product,FY20 Sales,FY21 Sales,Binned
0,1,Xbox One,10000.0,9000,20000
1,1,Xbox Series X,9000.0,20000,20000
2,1,Surface Pro,80000.0,85000,20000
3,1,Surface Book,1300.0,11000,20000
4,1,Windows,4000000.0,3000000,1000000
5,1,Azure,50000.0,90000,20000
6,2,iPhone 13,90000.0,120000,120000
7,2,Watch Series 7,2000.0,3000,20000
8,2,MacBook Pro M1,89000.0,60000,20000
9,2,iOS,5000000.0,4000000,1000000


In [9]:
# Preview result in store: fetch the table stored under 'bin-5' (presumably an
# output name defined by the steps in binning.json -- confirm) and show its
# first rows.
binning_pipeline.get_dataset('bin-5').head()

Unnamed: 0,ID,Product,FY20 Sales,FY21 Sales,Binned
0,1,Xbox One,10000.0,9000,20000
1,1,Xbox Series X,9000.0,20000,20000
2,1,Surface Pro,80000.0,85000,20000
3,1,Surface Book,1300.0,11000,20000
4,1,Windows,4000000.0,3000000,1000000


## Another example: compound filter and aggregate from a JSON spec

In [10]:
# Load the compound filter/aggregate pipeline specification (`json` is
# imported in the first cell). json.load parses directly from the file object,
# and the explicit encoding avoids depending on the platform's locale encoding.
with open('compound-filter-aggregate.json', 'r', encoding='utf-8') as compound_spec:
    pipeline_spec = json.load(compound_spec)

In [11]:

# Build the pipeline from the "steps" entry of the loaded spec.
compound_steps = pipeline_spec['steps']
compound_pipeline = Pipeline.from_json(compound_steps)

# Register both input tables in the store as (dataset name, CSV path) pairs.
# The names are presumably the ones the spec's steps refer to -- TODO confirm.
compound_inputs = (
    ('data/companies.csv', '../data/companies.csv'),
    ('data/products.csv', '../data/products.csv'),
)
for dataset_name, csv_path in compound_inputs:
    compound_pipeline.add_dataset(dataset_name, path=csv_path)

In [12]:
# Execute the pipeline; run() is not the last expression here, so its result
# is not displayed by this cell.
compound_pipeline.run()

# Print the names of the tables now held in the store.
store_tables = compound_pipeline.list_store()
print(store_tables)

# Fetch the final output table by name and preview its first rows.
final_table = compound_pipeline.get_dataset('filter-aggregate-lookup-final-output-table')
final_table.head()

['data/companies.csv', 'data/products.csv', 'filter-aggregate-lookup-final-output-table']


Unnamed: 0,ID,Name,Employees,US,Growth
0,1,Microsoft,160000,True,4.0
1,2,Apple,150000,True,2.0
2,3,Google,135000,True,
3,4,Amazon,1250000,True,
4,5,Samsung,270000,False,2.0


## Save pipeline spec to JSON

Note: Saving a pipeline stores only its specification (the steps), not the store. To recreate the results, create a new pipeline from the saved spec, load the input datasets into its store, and run the pipeline again.

In [13]:
# Write the pipeline specification to a JSON file; the file is read back
# further down to rebuild the pipeline.
aggregated_lookup.to_json('aggregate-lookup-save-test.json')

In [14]:
# Read back the saved specification (`json` is imported in the first cell).
# json.load parses directly from the file object, and the explicit encoding
# avoids depending on the platform's default locale encoding.
with open('aggregate-lookup-save-test.json', 'r', encoding='utf-8') as spec:
    pipeline_spec = json.load(spec)


# Create pipeline
pipeline = Pipeline.from_json(pipeline_spec['steps'])

# Only the steps are rebuilt from the spec, so the input tables are registered
# again under the same names used when the original pipeline was created.
pipeline.add_dataset('companies', path='../data/companies.csv')
pipeline.add_dataset('products', path='../data/products.csv')

# Run pipeline
pipeline.run()

Unnamed: 0,ID,Name,Employees,US,Total sales >= 10k
0,1,Microsoft,160000,True,3206000
1,2,Apple,150000,True,4180000
2,3,Google,135000,True,660000
3,4,Amazon,1250000,True,38000
4,5,Samsung,270000,False,630000


In [15]:
# Fetch the final table produced by the pipeline rebuilt from the saved spec
# and display it (it is compared with the original result further down).
lookup_1 = pipeline.get_dataset('lookup-1')
lookup_1

Unnamed: 0,ID,Name,Employees,US,Total sales >= 10k
0,1,Microsoft,160000,True,3206000
1,2,Apple,150000,True,4180000
2,3,Google,135000,True,660000
3,4,Amazon,1250000,True,38000
4,5,Samsung,270000,False,630000


In [16]:
# Test result is the same from the first execution (original pipeline creation).
# `assert_frame_equal` is imported in the first cell; it raises an
# AssertionError if the two tables differ and produces no output when they match.
assert_frame_equal(lookup_1, lookup_final_result)

## Save store to a zip file

In [17]:
# Save the pipeline's store to a zip archive; the archive is loaded into a new
# pipeline in the next section.
pipeline.save_store('./test.zip')

## Read store from zip file

In [18]:
# Create a separate pipeline and populate its store from the saved archive.
new_pipeline = Pipeline()
new_pipeline.load_store('./test.zip')

# Display the names of the tables restored from the archive.
restored_tables = new_pipeline.list_store()
restored_tables

['companies', 'products', 'join-1', 'filter-1', 'aggregate-1', 'lookup-1']