In [1]:
%%configure -f
{
    "conf": {
        "spark.sql.shuffle.partitions": 64,
        "spark.sql.broadcastTimeout": 14400,
        "spark.port.maxRetries": 100,
        "spark.executor.allowSparkContext": "true"
    }
}

StatementMeta(, 4, -1, Finished, Available)

In [2]:
# Imports (transparency_engine comes from the installed wheel package)

from typing import Dict
from transparency_engine.containers import ContainerKeys, build_container
from transparency_engine.io.data_handler import DataHandler, DataHandlerModes
from transparency_engine.pipeline import TransparencyPipeline
from transparency_engine.typing import PipelineSteps


StatementMeta(defaultSpark32, 4, 2, Finished, Available)



### Manually Update the Run Settings (Subfolder Path, Date/Country Tag, Storage Account) for This Run

In [3]:
# MANUALLY UPDATE THESE VALUES FOR EACH RUN
# Single source of truth for the run date (YYYY-MM-DD). Both the subfolder path
# and the date/country tag below are derived from it, so the two spellings of
# the date can no longer drift apart when only one of them is edited.
run_date = '2023-09-24'
# Prefix of the date/country tag -- presumably a country code; TODO confirm.
country_code = 'DR'
# Storage account name used to build the abfss:// input paths in step_config.
storagename = 'beowneu2datalake04'

# Fail fast on a malformed date instead of silently building wrong paths/names.
assert len(run_date) == 10 and run_date.replace('-', '').isdigit(), \
    "run_date must be formatted as YYYY-MM-DD"

# Subfolder that holds this run's input CSV files (used in step_config paths).
subfolderpath = f'BeneficialOwnershipModel/{run_date}'
# Tag appended to the storage root name in pipeline_config, e.g. 'DR20230924'.
datecountry = f"{country_code}{run_date.replace('-', '')}"


StatementMeta(defaultSpark32, 4, 3, Finished, Available)

## Pipeline Configurations

In [4]:
# Top-level pipeline definition: descriptive metadata, the storage settings,
# and the list of step names to run.
pipeline_config = dict(
    name="Transparency Engine Pipeline",
    description="Transparency Engine using open or customer data",
    # Storage type and root name; the root embeds this run's date/country tag.
    storage=dict(
        type="hive",
        root=f"BeneficialOwnership{datecountry}",
    ),
    # Step names; each one has a matching section under step_config["steps"].
    steps=[
        "prep",
        "individual_link_prediction",
        "individual_link_filtering",
        "macro_link_prediction",
        "macro_link_filtering",
        "scoring",
        "report",
    ],
)


StatementMeta(defaultSpark32, 4, 4, Finished, Available)

In [5]:
def _curated_path(filename: str) -> str:
    """Return the abfss:// URL of ``filename`` inside this run's input subfolder.

    Uses the notebook-level ``storagename`` and ``subfolderpath`` settings, so
    the container/account/subfolder prefix is spelled out in exactly one place
    instead of being repeated for every input file.

    Args:
        filename: File name relative to the run subfolder, e.g. "activity.csv".

    Returns:
        The full "abfss://curated@<account>.dfs.core.windows.net/<subfolder>/<file>" URL.
    """
    return f"abfss://curated@{storagename}.dfs.core.windows.net/{subfolderpath}/{filename}"


# Per-step settings, keyed by the step names listed in pipeline_config["steps"].
step_config = {
    "steps": {
        # Input datasets: where each CSV lives and which prep sub-steps apply to it.
        "prep": [
            {
                "name": "activity",
                "type": "dynamic",
                "path": _curated_path("activity.csv"),
                "steps": ["load", "preprocess"],
                "config": {},
            },
            {
                "name": "contact",
                "type": "static",
                "path": _curated_path("contact.csv"),
                "steps": ["load", "fuzzy_match", "preprocess"],
                # Attributes to fuzzy-match, each with its own similarity threshold.
                "fuzzy_match_on": [
                    {"name": "name", "config": {"min_similarity": 0.8}},
                    {"name": "address", "config": {"min_similarity": 0.8}},
                ],
                "config": {},
            },
            {
                "name": "entity",
                "type": "entity",
                "path": _curated_path("entityweight.csv"),
                "steps": ["load"],
            },
            {
                "name": "entityReviewFlag",
                "type": "reviewflag",
                "path": _curated_path("entityredflag.csv"),
                # Companion file describing the flags themselves.
                "metadata": {
                    "type": "reviewflagmetadata",
                    "path": _curated_path("redflagdefinition.csv"),
                },
                "steps": ["load", "preprocess"],
            },
            {
                "name": "attributeDefinition",
                "type": "metadata",
                "path": _curated_path("attributedefinition.csv"),
                "steps": ["load"],
            },
        ],
        # Link prediction per dataset, split into static and dynamic inputs.
        "individual_link_prediction": {
            "static": [
                {
                    "name": "contact",
                    "config": {
                        "min_weight": 0.01,
                        "min_similarity": 0.01,
                        "direct_link_min_weight": 0.01,
                    },
                },
            ],
            "dynamic": [
                {
                    "name": "activity",
                    "config": {
                        "min_weight": 1.0,
                        "sync_min_similarity": 0.5,
                        "async_min_similarity": 0.5,
                        "n_connected_components": 100,
                        "min_component_size": 5,
                    },
                },
            ],
        },
        # Filtering thresholds applied to the dynamic (activity) links.
        "individual_link_filtering": {
            "dynamic": [
                {
                    "name": "activity",
                    "config": {
                        "min_overall_similarity": 0.0,
                        "min_sync_similarity": 0.8,
                        "min_async_similarity": 0.8,
                        "sync_attributes": ["tender", "buyer", "item"],
                        "async_attributes": ["item"],
                    },
                },
            ],
        },
        # Macro link prediction over the named link inputs.
        "macro_link_prediction": {
            "name": "macro",
            "inputs": ["activity_filtered_links", "contact_links"],
            "config": {
                "min_weight": 0.1,
                "min_similarity": 0.1,
                "direct_link_min_weight": 0.1,
            },
        },
        "macro_link_filtering": {
            "name": "macro",
            "static": [
                {"name": "contact", "config": {"include_fuzzy_match": True}},
            ],
            "dynamic": [
                {"name": "activity", "config": {"include_fuzzy_match": False}},
            ],
            "config": {"max_path_length": 5},
        },
        # Scoring refers to the prep datasets and the macro links by name.
        "scoring": {
            "entity": "entity",
            "entity_flag": "entityReviewFlag",
            "flag_metadata": "entityReviewFlag_metadata",
            "predicted_links": "macro",
            "config": {},
        },
        "report": {
            "entity": "entity",
            "static": [
                {"name": "contact", "config": {}},
            ],
            "dynamic": [
                {"name": "activity", "config": {}},
            ],
            "other": [],
            "entity_flag": "entityReviewFlag",
            "network_score": "network_scoring",
            "predicted_links": "macro",
            "flag_metadata": "entityReviewFlag_metadata",
            "attribute_metadata": "attributeDefinition",
            "config": {
                # NOTE(review): only "item" is listed here, while
                # individual_link_filtering uses "tender", "buyer", "item" for
                # sync_attributes -- confirm the difference is intentional.
                "sync_attributes": ["item"],
                "async_attributes": ["item"],
                "entity_name_attribute": "name",
                # NOTE(review): points at a local dev server; confirm for shared runs.
                "base_url": "http://localhost:3000/report/",
            },
        },
    }
}

StatementMeta(defaultSpark32, 4, 5, Finished, Available)

## Pipeline Execution

In [6]:
# Create the pipeline object; it is run in a later cell.
pipeline = TransparencyPipeline()

# Storage settings from the pipeline config (an empty dict if the section is absent).
storage_config: Dict[str, str] = pipeline_config.get("storage", dict())

# Translate the configured storage type string via DataHandlerModes.from_string and
# read the storage root; both default to "" when the key is missing.
handler_mode = DataHandlerModes.from_string(storage_config.get("type", ""))
handler_root = storage_config.get("root", "")

# Values registered with the container: both config dicts plus the data handler
# given as a (class, mode, root) tuple.
container_attributes = {
    ContainerKeys.STEP_CONFIG: step_config,
    ContainerKeys.PIPELINE_CONFIG: pipeline_config,
    ContainerKeys.DATA_HANDLER: (DataHandler, handler_mode, handler_root),
}

# Kept as the cell's last expression so the returned container is displayed as output.
build_container(
    container_attributes,
    modules=["transparency_engine.pipeline"],
    packages=[],
)

StatementMeta(defaultSpark32, 4, 6, Finished, Available)

<dependency_injector.containers.DynamicContainer at 0x7f5412c886d0>

In [7]:
# Convert the configured step names with PipelineSteps.from_string_list, then run
# the pipeline with the result.
# NOTE(review): the recorded output of this cell ends in a PythonException raised
# from entity_scoring.py / measures.py compute_score, where float() received None.
# Presumably a scoring measure was null for some entity -- verify the scoring
# inputs before re-running.
step_names = pipeline_config.get("steps", [])
steps = PipelineSteps.from_string_list(step_names)
pipeline.run(steps=steps)

StatementMeta(defaultSpark32, 4, 7, Finished, Available)

{'name': LSHConfig(data_partitions=640, include_word_delimiter=True, ngram_length=4, num_hash_tables=3, min_df=0.0001, max_df=0.1, min_similarity=0.8), 'address': LSHConfig(data_partitions=640, include_word_delimiter=True, ngram_length=4, num_hash_tables=3, min_df=0.0001, max_df=0.1, min_similarity=0.8)}
+--------------------+--------------------+------------------+-----------+
|              Source|              Target|        Similarity|AttributeID|
+--------------------+--------------------+------------------+-----------+
|CNE ENVIRONMENTAL...|CNE ENVIRONMENTAL...|0.9444444444444444|       name|
|MR. PATRICK ALOZI...|MR. PATRICK ALOZI...|0.9230769230769231|       name|
|A.G. VISION CONST...|A.G. VISION CONST...|0.8518518518518519|       name|
|CNE ENVIRONMENTAL...|CNE ENVIRONMENTAL...|0.8181818181818182|       name|
|GITTO COSTRUZIONI...|GITTO COSTRUZIONI...|0.8064516129032258|       name|
+--------------------+--------------------+------------------+-----------+
only showing top 5 



PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/home/trusted-service-user/cluster-env/clonedenv/lib/python3.8/site-packages/transparency_engine/analysis/scoring/entity_scoring.py", line 132, in <lambda>
    lambda measure_values: compute_score(measure_values, configs), FloatType()
  File "/home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages/transparency_engine/analysis/scoring/measures.py", line 338, in compute_score
    float(
TypeError: float() argument must be a string or a number, not 'NoneType'
