In [20]:
import sys
import os
os.chdir("/app")
from omegaconf import OmegaConf
from config import columns_namespace, pipeline_config
from etl_component.proto.etl_pipeline_pb2 import Alert, Match
from config import columns_namespace
from custom.ms.datatypes.field import InputRecordField
from etl_pipeline.custom.ms.payload_loader import PayloadLoader
from etl_pipeline.custom.ms.transformations import (
    create_agent_input_agg_col_config,
    prepend_agent_name_to_ap_or_wl_or_aliases_key,
)
from etl_pipeline.custom.ms.watchlist_extractor import WatchlistExtractor
from etl_pipeline.pipeline import ETLPipeline
import json
from etl_pipeline.data_processor_engine.json_engine.json_engine import JsonProcessingEngine


In [21]:


class MSPipeline(ETLPipeline):
    def convert_raw_to_standardized(self, df):
        return df

    def transform_standardized_to_cleansed(self, payload):
        payload = {key: payload[key] for key in sorted(payload)}
        payload = PayloadLoader().load_payload_from_json(payload)

        matches = payload["matchesPayloads"]
        parties = payload["alertPayload"]["supplementalInfo"]["parties"]
        payload["alertPayload"]["inputRecord"]["fields"] = {
            i["name"]: InputRecordField(**i)
            for i in payload["alertPayload"]["inputRecord"]["fields"]
        }
        fields = payload["alertPayload"]["inputRecord"]["fields"]

        for match in matches:
            WatchlistExtractor().update_match_with_wl_values(match)
            match[columns_namespace.TRIGGERED_BY] = self.engine.set_trigger_reasons(
                match, self.pipeline_config.FUZZINESS_LEVEL
            )
            self.engine.set_beneficiary_hits(match)
        self.engine.connect_full_names(parties)
        self.engine.collect_party_values(parties, payload)
        payload[columns_namespace.ALL_CONNECTED_PARTIES_NAMES] = payload[
            columns_namespace.ALL_PARTY_TYPES
        ]
        names_source_cols = [
            columns_namespace.ALL_PARTY_NAMES,
            columns_namespace.ALL_CONNECTED_PARTIES_NAMES,
        ]

        payload.update(
            {
                columns_namespace.CLEANED_NAMES: self.engine.get_clean_names_from_concat_name(
                    fields.get(fields.get(columns_namespace.CONCAT_ADDRESS, "")),
                    {key: payload[key] for key in names_source_cols},
                )
            }
        )
        payload.update(
            {
                columns_namespace.CONCAT_RESIDUE: payload[columns_namespace.CLEANED_NAMES][
                    columns_namespace.CONCAT_RESIDUE
                ]
            }
        )
        concat_residue = payload[columns_namespace.CONCAT_RESIDUE]
        concat_address = fields.get(
            fields.get(columns_namespace.CONCAT_ADDRESS, ""),
            "",
        )

        payload.update(
            {columns_namespace.CONCAT_ADDRESS_NO_CHANGES: concat_residue == concat_address}
        )
        for record in matches:
            record[columns_namespace.AP_TRIGGERS] = self.engine.set_triggered_tokens_discovery(
                payload, record, fields
            )

        return payload

    def get_key(self, payload, match_id, conf):
        new_config = {}
        for key, value in dict(conf).items():
            temp_dict = dict(value)
            for new_key in temp_dict:
                for element in temp_dict[new_key]:
                    elements = element.split(".")
                    if "matchesPayloads" in element:
                        value = payload["matchesPayloads"][match_id]
                        elements = elements[1:]
                    else:
                        value = payload
                    for field_name in elements:
                        try:
                            value = value.get(field_name, None)
                        except TypeError:
                            key = PayloadLoader.LIST_ELEMENT_REGEX.sub("", field_name)
                            ix = int(PayloadLoader.LIST_ELEMENT_REGEX.match(field_name).groups(0))
                            value = value[key][ix]
                    new_config[elements[-1]] = value
        return new_config

    def load_config(self, alert_type="WM_ADDRESS"):
        filenames = {"WM_ADDRESS": "config/agents_input_WM_ADDRESS.yaml"}
        import pdb; pdb.set_trace()
        yaml_conf = OmegaConf.load(filenames[alert_type])
        agent_config = {}
        for key, value in dict(yaml_conf).items():
            temp_dict = dict(value)
            party_config = {}
            agent_config[key] = party_config
            for new_key in temp_dict:
                party_config[new_key] = []

                for element in temp_dict[new_key]:
                    elements = element.split(".")
                    party_config[new_key].append(elements[-1])
        return agent_config, yaml_conf

    def transform_cleansed_to_application(self, payload):
        matches = payload["matchesPayloads"]

        agent_config, yaml_conf = self.load_config()
        agent_input_prepended_agent_name_config = prepend_agent_name_to_ap_or_wl_or_aliases_key(
            agent_config
        )

        agent_input_agg_col_config = create_agent_input_agg_col_config(
            agent_input_prepended_agent_name_config
        )

        for num, match in enumerate(matches):
            config = self.get_key(payload, num, yaml_conf)
            self.engine.sql_to_merge_specific_columns_to_standardized(
                agent_input_prepended_agent_name_config,
                match,
                config,
                False,
            )
            config.update(
                {
                    key: match.get(key)
                    for key in match
                    if key.endswith("_ap") or key.endswith("_wl")
                }
            )
            self.engine.sql_to_merge_specific_columns_to_standardized(
                agent_input_agg_col_config, match, config, True
            )
        return payload


In [22]:
def load_alert():
    with open("API/flat_response.json", "r") as f:
        text = json.load(f)
        match1 = Match(match_id="1", match_name="1")
        match2 = Match(match_id="2", match_name="2")
        alert = Alert(batch_id="1", alert_name="2", matches=[match1, match2])
        for key, value in text.items():
            alert.flat_payload[str(key)] = str(value)
    return alert

In [23]:
alert = load_alert()

In [24]:
engine = JsonProcessingEngine(pipeline_config)
pipeline = MSPipeline(engine, config=pipeline_config)

payload = load_alert()
payload = payload.flat_payload
payload = pipeline.transform_standardized_to_cleansed(payload)
payload = pipeline.transform_cleansed_to_application(payload)

with open("tests/shared/parsed_payload.pkl", "rb") as f:
    reference_payload = pickle.load(f)
for num in range(len(payload["matchesPayloads"])):
    for key in payload["matchesPayloads"][num]:
        try:
            assert (
                payload["matchesPayloads"][num][key]
                == reference_payload["matchesPayloads"][num][key]
            )
        except AssertionError:
            assert sorted(payload["matchesPayloads"][num][key]) == sorted(
                reference_payload["matchesPayloads"][num][key]
            )


> [0;32m/tmp/ipykernel_451928/1810233004.py[0m(89)[0;36mload_config[0;34m()[0m
[0;32m     87 [0;31m        [0mfilenames[0m [0;34m=[0m [0;34m{[0m[0;34m"WM_ADDRESS"[0m[0;34m:[0m [0;34m"config/agents_input_WM_ADDRESS.yaml"[0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     88 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 89 [0;31m        [0myaml_conf[0m [0;34m=[0m [0mOmegaConf[0m[0;34m.[0m[0mload[0m[0;34m([0m[0mfilenames[0m[0;34m[[0m[0malert_type[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     90 [0;31m        [0magent_config[0m [0;34m=[0m [0;34m{[0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     91 [0;31m        [0;32mfor[0m [0mkey[0m[0;34m,[0m [0mvalue[0m [0;32min[0m [0mdict[0m[0;34m([0m[0myaml_conf[0m[0;34m)[0m[0;34m.[0m[0mitems[0m[0;34m([0m[0;34m)[0m[0;34m:[0m[0

NameError: name 'pickle' is not defined

In [26]:
for match in payload["matchesPayloads"]:
    print("++++++++")
    print(match["nationality_agent_ap"], match["nationality_agent_wl"])
    print(match["residency_agent_ap"], match["residency_agent_wl"])
    print(match["dob_agent_ap"], match["dob_agent_wl"])

++++++++
None [[], [], [], None, None]
None None
['', '10/10/1969'] []
++++++++
None [['CHIC'], [], [], None, None]
None None
['', '10/10/1969'] ['01/11/1924']
