In [1]:
%%configure -f
{
    "conf": {
        "spark.sql.shuffle.partitions": 64,
        "spark.sql.broadcastTimeout": 14400,
        "spark.port.maxRetries": 100,
        "spark.executor.allowSparkContext": "true",
        "spark.executor.memory": "16g",
        "spark.driver.memory": "64g"
    }
}

StatementMeta(defaultSpark32, 0, -1, Finished, Available, Finished)

In [2]:
# Imports (transparency_engine is installed from its wheel)

from typing import Dict
from transparency_engine.containers import ContainerKeys, build_container
from transparency_engine.io.data_handler import DataHandler, DataHandlerModes
from transparency_engine.pipeline import TransparencyPipeline
from transparency_engine.typing import PipelineSteps


StatementMeta(defaultSpark32, 0, 2, Finished, Available, Finished)



### Manually update the subfolder path, run tag, and storage account for this run

In [3]:
# Run-specific parameters -- edit these by hand before every run.

# Folder inside the "curated" container that holds this run's input CSV files.
subfolderpath = "BeneficialOwnershipModel/TCU"

# Run tag; pipeline_config uses it to build the storage root ("BO_<datecountry>").
datecountry = "TCU20240806"

# Storage account name used as the host part of the abfss:// input paths.
storagename = "hmxactaxdlconsmrdatadv"


StatementMeta(defaultSpark32, 0, 3, Finished, Available, Finished)

## Pipeline Configurations

In [4]:
# Notes on the names associated with each pipeline step (presumably the tables
# each step writes -- confirm against the transparency_engine documentation):
#   prep                       - input name, _fuzzy_match, _prep
#   individual_link_prediction - _links
#   individual_link_filtering  - _filtered_links, _filtered_graph
#   macro_link_prediction      - _links (macro_links)
#   macro_link_filtering       - _filtered_links (macro_filtered_links)
#   scoring                    - entity_scoring, network_scoring
#   report                     - entity_activity_report, entity_temporal_activity_report,
#                                entity_related_activity_report,
#                                entity_related_activity_overall_report,
#                                entity_activity_link_report, entity_attributes_report,
#                                entity_graph_report, html_report, report_url
pipeline_config = {
    "name": "Transparency Engine Pipeline",
    "description": "Transparency Engine using TCU data",
    "storage": {
        "type": "hive",
        # The storage root is derived from the run tag set in the parameters cell.
        "root": f"BO_{datecountry}",
    },
    # Steps to execute in this run. Uncomment further entries to run more of the
    # pipeline. Every entry (active or commented out) ends with a comma on
    # purpose: without it, uncommenting the next line would make Python join the
    # two adjacent string literals into a single, invalid step name.
    "steps": [
        "prep",
        "individual_link_prediction",
#        "individual_link_filtering",
#        "macro_link_prediction",
#        "macro_link_filtering",
#        "scoring",
#        "report",
    ],
}


StatementMeta(defaultSpark32, 0, 4, Finished, Available, Finished)

In [5]:
def _curated_csv(file_name: str) -> str:
    """Return the abfss:// URL of ``file_name`` inside this run's input folder.

    The storage account (``storagename``) and folder (``subfolderpath``) come
    from the run-parameters cell.
    """
    return f"abfss://curated@{storagename}.dfs.core.windows.net/{subfolderpath}/{file_name}"


# "prep" step: one entry per input dataset, each with its source CSV and the
# sub-steps to apply to it.
_prep_inputs = [
    {
        "name": "activity",
        "type": "dynamic",
        "path": _curated_csv("activity.csv"),
        "steps": ["load", "preprocess"],
        "config": {},
    },
    {
        "name": "contact",
        "type": "static",
        "path": _curated_csv("contact.csv"),
        "steps": ["load", "fuzzy_match", "preprocess"],
        # Attributes that take part in the "fuzzy_match" sub-step.
        "fuzzy_match_on": [
            {"name": "company_name", "config": {"min_similarity": 0.8}},
            {"name": "address", "config": {"min_similarity": 0.8}},
        ],
        "config": {},
    },
    {
        "name": "ownership",
        "type": "static",
        "path": _curated_csv("ownership.csv"),
        "steps": ["load", "preprocess"],
        "config": {},
    },
    {
        "name": "entity",
        "type": "entity",
        "path": _curated_csv("entityweight.csv"),
        "steps": ["load"],
    },
    {
        "name": "entityReviewFlag",
        "type": "reviewflag",
        "path": _curated_csv("redflags.csv"),
        "metadata": {
            "type": "reviewflagmetadata",
            "path": _curated_csv("redflagdefinition.csv"),
        },
        "steps": ["load", "preprocess"],
    },
    {
        "name": "attributeDefinition",
        "type": "metadata",
        "path": _curated_csv("attributedefinitions.csv"),
        "steps": ["load"],
    },
]

# "individual_link_prediction" step: thresholds per static / dynamic dataset.
_individual_link_prediction = {
    "static": [
        {
            "name": "contact",
            "config": {
                "min_weight": 0.01,
                "min_similarity": 0.01,
                "direct_link_min_weight": 0.01,
            },
        },
        {"name": "ownership", "config": {}},
    ],
    "dynamic": [
        {
            "name": "activity",
            "config": {
                "min_weight": 1.0,
                "sync_min_similarity": 0.5,
                "async_min_similarity": 0.5,
                "n_connected_components": 1,
                "min_component_size": 1,
            },
        },
    ],
}

# "individual_link_filtering" step: only the dynamic "activity" dataset is configured.
_individual_link_filtering = {
    "dynamic": [
        {
            "name": "activity",
            "config": {
                "min_overall_similarity": 0.0,
                "min_sync_similarity": 0.8,
                "min_async_similarity": 0.8,
                "sync_attributes": ["tender", "buyer", "item"],
                "async_attributes": ["buyer", "item"],
            },
        },
    ],
}

# "macro_link_prediction" step: named inputs plus thresholds.
_macro_link_prediction = {
    "name": "macro",
    "inputs": [
        "activity_filtered_links",
        "contact_links",
        "ownership_links",
    ],
    "config": {
        "min_weight": 0.1,
        "min_similarity": 0.1,
        "direct_link_min_weight": 0.1,
    },
}

# "macro_link_filtering" step.
_macro_link_filtering = {
    "name": "macro",
    "static": [
        {"name": "contact", "config": {"include_fuzzy_match": False}},
        {"name": "ownership", "config": {}},
    ],
    "dynamic": [
        {"name": "activity", "config": {"include_fuzzy_match": False}},
    ],
    "config": {"max_path_length": 2},
}

# "scoring" step: refers to prep datasets and the macro links by name.
_scoring = {
    "entity": "entity",
    "entity_flag": "entityReviewFlag",
    "flag_metadata": "entityReviewFlag_metadata",
    "predicted_links": "macro",
    "config": {},
}

# "report" step.
_report = {
    "entity": "entity",
    "static": [
        {"name": "contact", "config": {}},
        {"name": "ownership", "config": {}},
    ],
    "dynamic": [
        {"name": "activity", "config": {}},
    ],
    "other": [],
    "entity_flag": "entityReviewFlag",
    "network_score": "network_scoring",
    "predicted_links": "macro",
    "flag_metadata": "entityReviewFlag_metadata",
    "attribute_metadata": "attributeDefinition",
    "config": {
        "sync_attributes": ["tender", "buyer", "item"],
        "async_attributes": ["buyer", "item"],
        "entity_name_attribute": "name",
        "base_url": "https://traengine.eastus.cloudapp.azure.com/report/",
    },
}

# Per-step settings, keyed by step name (same key order as the pipeline's step list).
step_config = {
    "steps": {
        "prep": _prep_inputs,
        "individual_link_prediction": _individual_link_prediction,
        "individual_link_filtering": _individual_link_filtering,
        "macro_link_prediction": _macro_link_prediction,
        "macro_link_filtering": _macro_link_filtering,
        "scoring": _scoring,
        "report": _report,
    }
}

StatementMeta(defaultSpark32, 0, 5, Finished, Available, Finished)

## Pipeline Execution

In [6]:
# Create the pipeline object; it is run in the next cell.
pipeline = TransparencyPipeline()

# Storage settings live under pipeline_config["storage"]; default to an empty
# mapping so the .get() lookups below still work when the key is absent.
storage_config: Dict[str, str] = pipeline_config.get("storage", {})
storage_mode = DataHandlerModes.from_string(storage_config.get("type", ""))
storage_root = storage_config.get("root", "")

# Values registered in the container: both config dicts, plus the data handler
# given as (class, mode, root).
container_bindings = {
    ContainerKeys.STEP_CONFIG: step_config,
    ContainerKeys.PIPELINE_CONFIG: pipeline_config,
    ContainerKeys.DATA_HANDLER: (DataHandler, storage_mode, storage_root),
}

# Left as the last bare expression so the notebook displays the returned container.
build_container(
    container_bindings,
    modules=["transparency_engine.pipeline"],
    packages=[],
)

StatementMeta(defaultSpark32, 0, 6, Finished, Available, Finished)

<dependency_injector.containers.DynamicContainer at 0x7fcc3b386be0>

In [7]:
# Convert the step names listed in pipeline_config["steps"] (empty list if the
# key is missing) into PipelineSteps values, then run the pipeline on them.
requested_step_names = pipeline_config.get("steps", [])
steps = PipelineSteps.from_string_list(requested_step_names)
pipeline.run(steps=steps)

StatementMeta(defaultSpark32, 0, 7, Finished, Available, Finished)



+--------------------+--------------------+------------------+------------+
|              Source|              Target|        Similarity| AttributeID|
+--------------------+--------------------+------------------+------------+
|LWL SOLUCOES INTE...|RD SOLUCOES INTEG...|0.8260869565217391|company_name|
|FT EMPREENDIMENTO...|SEJA EMPREENDIMEN...|0.8461538461538461|company_name|
|BH SOLUCOES INTEG...|LWL SOLUCOES INTE...|0.8260869565217391|company_name|
|  DCG COMERCIAL LTDA|  ILG COMERCIAL LTDA|0.8571428571428571|company_name|
|BEMM SOLUCOES INT...|M2R SOLUCOES INTE...|0.8260869565217391|company_name|
+--------------------+--------------------+------------------+------------+
only showing top 5 rows

+--------------+---------+------+
|          Node|Partition|NodeId|
+--------------+---------+------+
| 1000823000180| EntityID|     0|
|10013974000163| EntityID|     1|
|10014760000101| EntityID|     2|
|10014999000181| EntityID|     3|
|10015379000167| EntityID|     4|
|10015889000134| En

MemoryError: 