In [1]:
%%configure -f
{
    "conf": {
        "spark.sql.shuffle.partitions": 16,
        "spark.sql.broadcastTimeout": 14400,
        "spark.port.maxRetries": 100
    }
}

In [2]:
from typing import Dict
from transparency_engine.containers import ContainerKeys, build_container
from transparency_engine.io.data_handler import DataHandler, DataHandlerModes
from transparency_engine.pipeline import TransparencyPipeline
from transparency_engine.typing import PipelineSteps

## Pipeline Configurations

In [3]:
# Storage config: derive the storage account name from the linked service's endpoint.
import json
from urllib.parse import urlparse

storageLinkedService = 'LS_DataLake'
# The linked-service properties come back as a JSON string (decoded with json.loads below).
storageAccount_ls = mssparkutils.credentials.getPropertiesAll(storageLinkedService)

_endpoint = str(json.loads(storageAccount_ls).get('Endpoint') or '').strip()
# Parse the endpoint as a URL instead of stripping a hard-coded 'https://' prefix, so that a
# different scheme (or a scheme-less host) cannot leak into the account name and silently
# produce malformed abfss:// paths further down the notebook.
_host = urlparse(_endpoint if '://' in _endpoint else '//' + _endpoint).hostname or ''
# The account name is the first dot-separated label of the endpoint host.
storage_account = _host.split('.')[0]
if not storage_account:
    # Fail here with a clear message rather than later with an unreadable path error.
    raise ValueError(
        f"Linked service '{storageLinkedService}' has no usable 'Endpoint' property; "
        "cannot determine the storage account name."
    )

In [4]:
# Pipeline-level settings: display name/description, the storage backend ("type" and "root"
# are read when the data handler is registered) and the ordered list of step names to run.
pipeline_config = dict(
    name="Transparency Engine Pipeline",
    description="Demo of the Transparency Engine using synthetic data",
    storage=dict(type="hive", root="TE_GeneratedData"),
    steps=[
        "prep",
        "individual_link_prediction",
        "individual_link_filtering",
        "macro_link_prediction",
        "macro_link_filtering",
        "scoring",
        "report",
    ],
)

In [5]:
# Common prefix shared by every input CSV path referenced in the prep step below.
_CSV_ROOT = "abfss://curated@" + storage_account + ".dfs.core.windows.net/BeneficialOwnershipModel/GeneratedData/CSV/"

# Per-step settings, keyed by the same step names that pipeline_config["steps"] lists.
step_config = {
    "steps": {
        # Input datasets: name, type, source CSV and the prep sub-steps applied to each.
        "prep": [
            {
                "name": "activity",
                "type": "dynamic",
                "path": _CSV_ROOT + "activity.csv",
                "steps": ["load", "preprocess"],
                "config": {},
            },
            {
                "name": "contact",
                "type": "static",
                "path": _CSV_ROOT + "contact.csv",
                "steps": ["load", "preprocess"],
                "fuzzy_match_on": ["name", "address"],
                "config": {},
            },
            {
                "name": "ownership",
                "type": "static",
                "path": _CSV_ROOT + "ownership.csv",
                "steps": ["load", "preprocess"],
                "config": {},
            },
            {
                "name": "entity",
                "type": "entity",
                "path": _CSV_ROOT + "entity.csv",
                "steps": ["load"],
            },
            {
                "name": "entityReviewFlag",
                "type": "reviewflag",
                "path": _CSV_ROOT + "redflag.csv",
                "metadata": {
                    "type": "reviewflagmetadata",
                    "path": _CSV_ROOT + "flag_definitions.csv",
                },
                "steps": ["load", "preprocess"],
            },
            {
                "name": "attributeDefinition",
                "type": "metadata",
                "path": _CSV_ROOT + "attribute_definitions.csv",
                "steps": ["load"],
            },
        ],
        "individual_link_prediction": {
            "static": [
                {"name": "contact", "config": {}},
                {"name": "ownership", "config": {}},
            ],
            "dynamic": [
                {"name": "activity", "config": {}},
            ],
        },
        "individual_link_filtering": {
            "dynamic": [
                {
                    "name": "activity",
                    "config": {
                        "sync_attributes": ["tender", "buyer", "item"],
                        "async_attributes": ["buyer", "item"],
                    },
                },
            ],
        },
        "macro_link_prediction": {
            "name": "macro",
            "inputs": [
                "activity_filtered_links",
                "contact_links",
                "ownership_links",
            ],
        },
        "macro_link_filtering": {
            "name": "macro",
            "static": [
                {"name": "contact", "config": {}},
                {"name": "ownership", "config": {}},
            ],
            "dynamic": [
                {"name": "activity", "config": {}},
            ],
        },
        "scoring": {
            "entity": "entity",
            "entity_flag": "entityReviewFlag",
            "flag_metadata": "entityReviewFlag_metadata",
            "predicted_links": "macro",
            "config": {},
        },
        "report": {
            "entity": "entity",
            "static": [
                {"name": "contact", "config": {}},
                {"name": "ownership", "config": {}},
            ],
            "dynamic": [
                {"name": "activity", "config": {}},
            ],
            "other": [],
            "entity_flag": "entityReviewFlag",
            "network_score": "network_scoring",
            "predicted_links": "macro",
            "flag_metadata": "entityReviewFlag_metadata",
            "attribute_metadata": "attributeDefinition",
            "config": {
                "sync_attributes": ["tender", "buyer", "item"],
                "async_attributes": ["buyer", "item"],
                "entity_name_attribute": "company_name",
            },
        },
    },
}

## Pipeline Execution

In [6]:
# Create the pipeline object first, then register the configuration and the data handler
# with the container (same statement order as before).
pipeline = TransparencyPipeline()

# Storage settings fall back to an empty mapping when the "storage" key is absent.
storage_config: Dict[str, str] = pipeline_config.get("storage", {})
_handler_mode = DataHandlerModes.from_string(storage_config.get("type", ""))
_handler_root = storage_config.get("root", "")

_container_settings = {
    ContainerKeys.STEP_CONFIG: step_config,
    ContainerKeys.PIPELINE_CONFIG: pipeline_config,
    # The data handler is registered as a (class, mode, root) tuple.
    ContainerKeys.DATA_HANDLER: (DataHandler, _handler_mode, _handler_root),
}

build_container(
    _container_settings,
    modules=["transparency_engine.pipeline"],
    packages=[],
)

In [None]:
# Convert the configured step names into PipelineSteps values and run the pipeline with them.
_step_names = pipeline_config.get("steps", [])
steps = PipelineSteps.from_string_list(_step_names)
pipeline.run(steps=steps)