In [1]:
import os

# Point CONFIG_APP_DIR at the "config_xml" directory for this kernel process.
# NOTE(review): presumably etl_pipeline.config reads this variable when it is
# imported in the next cell, so this must run first — confirm.
os.environ["CONFIG_APP_DIR"] = "config_xml"

In [2]:
import sys
import os
import pickle
# Move up one directory so the relative "notebooks/..." and "tests/..." paths
# used further down resolve. The guard keeps the cell idempotent: once a
# "notebooks" directory is visible from the working directory, re-running the
# cell no longer climbs another level.
if not os.path.isdir("notebooks"):
    os.chdir("..")
from omegaconf import OmegaConf
from etl_pipeline.config import columns_namespace, pipeline_config
from service.proto.etl_pipeline_pb2 import Alert, Match
from etl_pipeline.config import columns_namespace
from etl_pipeline.custom.ms.datatypes.field import InputRecordField
from etl_pipeline.custom.ms.payload_loader import PayloadLoader
from etl_pipeline.custom.ms.transformations import (
    create_agent_input_agg_col_config,
    prepend_agent_name_to_ap_or_wl_or_aliases_key,
)
from etl_pipeline.custom.ms.watchlist_extractor import WatchlistExtractor
from etl_pipeline.pipeline import ETLPipeline
from pipelines.ms.json_pipeline import MSPipeline

import json
from etl_pipeline.data_processor_engine.json_engine.json_engine import JsonProcessingEngine




In [3]:
from etl_pipeline.config import columns_namespace as cn

In [4]:
def load_alert(path="notebooks/sample/alert.json"):
    """Build a sample ``Alert`` whose flat payload comes from a JSON file.

    Args:
        path: JSON file holding a top-level object. Defaults to the sample
            alert shipped with the notebooks.

    Returns:
        An ``Alert`` with two fixed ``Match`` entries and one ``flat_payload``
        item per top-level key of the JSON object (keys coerced to ``str``).
    """
    # Read the file first and close it before building the protobuf objects.
    with open(path, "r", encoding="utf-8") as f:
        flat_payload = json.load(f)

    matches = [
        Match(match_id="0", match_name="1"),
        Match(match_id="1", match_name="2"),
    ]
    alert = Alert(batch_id="1", alert_name="2", matches=matches)
    for key, value in flat_payload.items():
        alert.flat_payload[str(key)] = value
    return alert

In [5]:
# alert = load_alert()

In [6]:
# payload = load_alert()


# Pipeline definition

In [7]:
from etl_pipeline.config import agents_config
from etl_pipeline.config import columns_namespace as cn
from etl_pipeline.custom.ms.datatypes.field import InputRecordField
from etl_pipeline.custom.ms.payload_loader import PayloadLoader
from etl_pipeline.custom.ms.transformations import (
    create_agent_input_agg_col_config,
    prepend_agent_name_to_ap_or_wl_or_aliases_key,
)
from etl_pipeline.custom.ms.watchlist_extractor import WatchlistExtractor
from etl_pipeline.pipeline import ETLPipeline


class MSPipeline(ETLPipeline):
    """ETL pipeline for MS alerts held as nested dict/list payloads.

    This notebook-local definition rebinds the ``MSPipeline`` name imported
    from ``pipelines.ms.json_pipeline`` earlier in the notebook.

    The methods read ``self.engine`` and ``self.pipeline_config``.
    NOTE(review): presumably both are set by ``ETLPipeline.__init__`` — confirm.
    """

    def convert_raw_to_standardized(self, df):
        """Return ``df`` unchanged; no conversion is performed at this stage."""
        return df

    def transform_standardized_to_cleansed(self, payload):
        """Enrich ``payload`` and its match records in place, then return it.

        Steps, in order:
          1. Replace the raw input-record field items with a dict of
             ``InputRecordField`` objects keyed by field name.
          2. For every match: add watchlist values, store the trigger reasons
             under ``cn.TRIGGERED_BY`` and set beneficiary hits.
          3. Connect party full names and collect party values via the engine.
          4. Store ``cn.CLEANED_NAMES``, ``cn.CONCAT_RESIDUE`` and
             ``cn.CONCAT_ADDRESS_NO_CHANGES`` on the payload.
          5. For every match: store ``cn.AP_TRIGGERS``.

        Not idempotent: step 1 replaces ``cn.FIELDS`` with a dict, so a second
        call on the same payload would iterate that dict's keys instead of the
        raw field items.

        NOTE(review): the step-by-step cells later in this notebook read the
        parties from ``payload[cn.SUPPLEMENTAL_INFO][cn.RELATED_PARTIES]`` and
        the fields from ``payload[cn.ALERT_FIELD][cn.INPUT_RECORD_HIST][0]``;
        this method uses different paths — confirm which layout is current.

        Args:
            payload: Alert payload dict; must contain ``cn.MATCH_IDS`` and
                ``cn.ALERT_FIELD``.

        Returns:
            The same ``payload`` object.
        """
        match_ids = payload[cn.MATCH_IDS]
        matches = payload[cn.ALERT_FIELD][cn.MATCH_RECORDS]
        parties = payload[cn.ALERT_FIELD][cn.SUPPLEMENTAL_INFO][cn.PARTIES]

        input_record = payload[cn.ALERT_FIELD][cn.INPUT_RECORD_HIST][cn.INPUT_RECORD]
        input_record[cn.FIELDS] = {
            raw_field["name"]: InputRecordField(**raw_field)
            for raw_field in input_record[cn.FIELDS]
        }
        fields = input_record[cn.FIELDS]

        for match_id in match_ids:
            match = matches[match_id]
            WatchlistExtractor().update_match_with_wl_values(match)
            match[cn.TRIGGERED_BY] = self.engine.set_trigger_reasons(
                match, self.pipeline_config.FUZZINESS_LEVEL
            )
            self.engine.set_beneficiary_hits(match)

        self.engine.connect_full_names(parties)
        self.engine.collect_party_values(parties, payload)
        # Same object under both keys, not a copy.
        payload[cn.ALL_CONNECTED_PARTY_TYPES] = payload[cn.ALL_PARTY_TYPES]
        names_source_cols = [
            cn.ALL_PARTY_NAMES,
            cn.ALL_CONNECTED_PARTIES_NAMES,
        ]

        # ``fields`` maps a field name to an InputRecordField, so the text is
        # its ``.value``. The previous code passed the looked-up field object
        # back into ``fields.get`` as a key and therefore never reached the text.
        concat_address_field = fields.get(cn.CONCAT_ADDRESS)
        # When the field is absent keep the earlier fallbacks: ``None`` for
        # the name cleaner and ``""`` for the comparison below.
        concat_address_value = (
            None if concat_address_field is None else concat_address_field.value
        )

        payload[cn.CLEANED_NAMES] = self.engine.get_clean_names_from_concat_name(
            concat_address_value,
            {key: payload[key] for key in names_source_cols},
        )
        payload[cn.CONCAT_RESIDUE] = payload[cn.CLEANED_NAMES][cn.CONCAT_RESIDUE]

        concat_residue = payload[cn.CONCAT_RESIDUE]
        concat_address = "" if concat_address_value is None else concat_address_value
        payload[cn.CONCAT_ADDRESS_NO_CHANGES] = concat_residue == concat_address

        for match_id in match_ids:
            match = matches[match_id]
            match[cn.AP_TRIGGERS] = self.engine.set_triggered_tokens_discovery(
                payload, match, fields
            )

        return payload

    def get_key(self, payload, match, conf):
        """Resolve every dotted path in ``conf`` to a value.

        ``conf`` is a two-level mapping whose leaves are lists of dotted path
        strings. A path containing ``cn.MATCH_RECORDS`` is resolved against
        ``match`` after dropping its first segment; any other path is resolved
        against ``payload``.

        Args:
            payload: Alert payload dict.
            match: One match record dict.
            conf: Mapping of mappings of lists of dotted paths.

        Returns:
            Dict mapping the last segment of each path to the resolved value.
            Paths sharing a last segment overwrite each other.
        """
        new_config = {}
        for agent_conf in dict(conf).values():
            for paths in dict(agent_conf).values():
                for element in paths:
                    elements = element.split(".")
                    if cn.MATCH_RECORDS in element:
                        current = match
                        elements = elements[1:]
                    else:
                        current = payload

                    for field_name in elements:
                        if field_name == "INPUT_RECORD_FIELD":
                            # Input-record fields sit in the first list entry,
                            # keyed by the final path segment; take the text.
                            current = current[0][field_name][elements[-1]].value
                            break
                        try:
                            current = current.get(field_name, None)
                        except TypeError:
                            # NOTE(review): ``dict.get`` with a str key does not
                            # raise TypeError, so this list-index fallback looks
                            # unreachable for dict input — confirm the intent.
                            list_key = PayloadLoader.LIST_ELEMENT_REGEX.sub("", field_name)
                            # ``groups(0)`` returns a tuple; ``int()`` needs the
                            # captured index itself. Assumes the first group of
                            # the regex is the index — TODO confirm.
                            ix = int(
                                PayloadLoader.LIST_ELEMENT_REGEX.match(field_name).groups(0)[0]
                            )
                            current = current[list_key][ix]
                    new_config[elements[-1]] = current
        return new_config

    def load_config(self, alert_type="WM_ADDRESS"):
        """Load the agent configuration for ``alert_type``.

        Args:
            alert_type: Key into ``agents_config``.

        Returns:
            Tuple ``(agent_config, yaml_conf)``. ``yaml_conf`` is the raw entry
            from ``agents_config``; ``agent_config`` has the same two-level
            shape with every dotted path reduced to its last segment.
        """
        yaml_conf = agents_config[alert_type]
        agent_config = {
            agent_name: {
                side: [element.split(".")[-1] for element in paths]
                for side, paths in dict(agent_conf).items()
            }
            for agent_name, agent_conf in dict(yaml_conf).items()
        }
        return agent_config, yaml_conf

    def transform_cleansed_to_application(self, payload):
        """Merge agent input columns into every match record and return ``payload``.

        For each match the engine merge is run twice: first with the
        agent-name-prefixed config, then with the aggregated-column config
        after the resolved values have been extended with the match's
        ``*_ap`` / ``*_wl`` entries.

        Args:
            payload: Alert payload dict; must contain ``cn.MATCH_IDS`` and
                ``cn.ALERT_FIELD``.

        Returns:
            The same ``payload`` object.
        """
        match_ids = payload[cn.MATCH_IDS]
        matches = payload[cn.ALERT_FIELD][cn.MATCH_RECORDS]
        agent_config, yaml_conf = self.load_config()
        agent_input_prepended_agent_name_config = (
            prepend_agent_name_to_ap_or_wl_or_aliases_key(agent_config)
        )
        agent_input_agg_col_config = create_agent_input_agg_col_config(
            agent_input_prepended_agent_name_config
        )

        for match_id in match_ids:
            match = matches[match_id]
            config = self.get_key(payload, match, yaml_conf)
            self.engine.sql_to_merge_specific_columns_to_standardized(
                agent_input_prepended_agent_name_config,
                match,
                config,
                False,
            )
            # Add whatever the first merge wrote under *_ap / *_wl keys so the
            # aggregation pass can read it.
            config.update(
                {
                    key: value
                    for key, value in match.items()
                    if key.endswith(("_ap", "_wl"))
                }
            )
            self.engine.sql_to_merge_specific_columns_to_standardized(
                agent_input_agg_col_config, match, config, True
            )
        return payload


In [8]:
# Load the sample alert and rebuild it with its top-level keys in sorted order.
with open("notebooks/sample/alert.json", "r", encoding="utf-8") as file:
    payload = json.load(file)

payload_json = {key: payload[key] for key in sorted(payload)}
# payload_json = PayloadLoader().load_payload_from_json(payload_json)
# Match ids are the positions of the match records. They are written under
# cn.MATCH_IDS, the same key the rest of the notebook reads them back with.
payload_json[cn.MATCH_IDS] = list(range(len(payload_json[cn.ALERT_FIELD][cn.MATCH_RECORDS])))
payload = payload_json

In [9]:
# Build the JSON processing engine and the notebook-local MSPipeline defined
# above, both from the shared pipeline_config.
engine = JsonProcessingEngine(pipeline_config)
pipeline = MSPipeline(engine, config=pipeline_config)

## Transform standardized to cleansed (step by step)

In [10]:
# Match ids and the match records they index; the following cells walk
# through the cleansing stage one step at a time using these two names.
match_ids = payload[cn.MATCH_IDS]
matches = payload[cn.ALERT_FIELD][cn.MATCH_RECORDS]

In [11]:
# Related parties are read from the payload's top-level supplemental-info entry.
# NOTE(review): MSPipeline.transform_standardized_to_cleansed reads them from
# payload[cn.ALERT_FIELD][cn.SUPPLEMENTAL_INFO][cn.PARTIES] instead — confirm.
parties = payload[cn.SUPPLEMENTAL_INFO][cn.RELATED_PARTIES][cn.PARTIES]

In [12]:
# Replace every party with its "fields" mapping, in place so the payload sees
# the change. A party without a "fields" key is kept as it is, which makes the
# cell safe to re-run on already flattened parties.
# NOTE(review): this also lets a malformed party (no "fields") through silently.
for num, party in enumerate(parties):
    parties[num] = party.get("fields", party)

In [13]:
# Index the raw "field" items of the first input-record history entry by
# name, wrapping each one in an InputRecordField, and store the resulting
# dict on that entry under "INPUT_RECORD_FIELD".
input_record_entry = payload[cn.ALERT_FIELD][cn.INPUT_RECORD_HIST][0]
fields = {}
for raw_field in input_record_entry["field"]:
    fields[raw_field["name"]] = InputRecordField(**raw_field)
input_record_entry["INPUT_RECORD_FIELD"] = fields

In [14]:
# For every match record: let the watchlist extractor update it, store the
# engine's trigger reasons under cn.TRIGGERED_BY, then set beneficiary hits.
for match_id in match_ids:
    match = matches[match_id]
    WatchlistExtractor().update_match_with_wl_values(match)
    match[cn.TRIGGERED_BY] = engine.set_trigger_reasons(
        match, pipeline_config.FUZZINESS_LEVEL
    )
    # Return value is not used; presumably this updates `match` in place — confirm.
    engine.set_beneficiary_hits(match)

In [15]:
# Pass the flattened party list to the engine's full-name step. The return
# value is not used; presumably `parties` is updated in place — confirm.
engine.connect_full_names(parties)


In [16]:

# Return value is not used; the next line reads cn.ALL_PARTY_TYPES from the
# payload, so presumably this call writes the collected values there — confirm.
engine.collect_party_values(parties, payload)
# Same object under both keys, not a copy.
payload[cn.ALL_CONNECTED_PARTY_TYPES] = payload[cn.ALL_PARTY_TYPES]
# Payload keys whose values are handed to the name cleaner in a later cell.
names_source_cols = [
    cn.ALL_PARTY_NAMES,
    cn.ALL_CONNECTED_PARTIES_NAMES,
]

In [17]:
# Inspect the CONCAT_ADDRESS entry: the lookup yields the InputRecordField
# object itself (see output), so the text has to be read from its `.value`.
fields.get(cn.CONCAT_ADDRESS, "")

InputRecordField(name='CONCAT_ADDRESS', isScreenable='true', value='Joe Black & Jane Doe 111 A 11TH ST AAA 1A New Jersey NJ', sortOrder='13')

In [18]:
# Same lookup with a dict default; the key is present (see output), so the
# default is not used.
fields.get(cn.CONCAT_ADDRESS, {})

InputRecordField(name='CONCAT_ADDRESS', isScreenable='true', value='Joe Black & Jane Doe 111 A 11TH ST AAA 1A New Jersey NJ', sortOrder='13')

In [19]:
from dataclasses import asdict
# Run the engine's name cleaner on the CONCAT_ADDRESS text together with the
# party-name columns, and keep the result on the payload under cn.CLEANED_NAMES.
concat_address_text = fields.get(cn.CONCAT_ADDRESS, None).value
party_names_by_col = {key: payload[key] for key in names_source_cols}
payload[cn.CLEANED_NAMES] = engine.get_clean_names_from_concat_name(
    concat_address_text, party_names_by_col
)

In [20]:
# Expose the residue from the cleaned-names result as its own payload entry.
payload[cn.CONCAT_RESIDUE] = payload[cn.CLEANED_NAMES][cn.CONCAT_RESIDUE]


In [21]:

# The two values compared in the next cell: the residue stored on the payload
# and the text of the CONCAT_ADDRESS field.
concat_residue = payload[cn.CONCAT_RESIDUE]
# `.value` on the looked-up field raises AttributeError if the field is missing.
concat_address = fields.get(cn.CONCAT_ADDRESS, None).value

In [22]:
# Flag whether the residue is identical to the CONCAT_ADDRESS text.
payload.update({cn.CONCAT_ADDRESS_NO_CHANGES: concat_residue == concat_address})
# Store the engine's triggered-token discovery result on every match record.
# `match` stays bound to the last record after the loop; the next cell reads it.
for match_id in match_ids:
    match = matches[match_id]
    match[cn.AP_TRIGGERS] = engine.set_triggered_tokens_discovery(
        payload, match, fields
    )


In [23]:
# Inspect cn.WL_MATCHED_TOKENS of the last match record; `match` is the loop
# variable left over from the previous cell. The shown value is a string
# holding a JSON-style list, not a Python list.
match[cn.WL_MATCHED_TOKENS]

'["Doe"]'

In [24]:
# Preparation for the cleansed-to-application stage, mirroring the start of
# MSPipeline.transform_cleansed_to_application.
# These two re-read the same payload entries as the earlier cell.
match_ids = payload[cn.MATCH_IDS]
matches = payload[cn.ALERT_FIELD][cn.MATCH_RECORDS]
# agent_config holds only the last segment of each dotted path; yaml_conf is
# the raw agents_config entry with the full paths.
agent_config, yaml_conf = pipeline.load_config()
agent_input_prepended_agent_name_config = prepend_agent_name_to_ap_or_wl_or_aliases_key(
    agent_config
)

# Derived from the prefixed config; used for the second merge pass below.
agent_input_agg_col_config = create_agent_input_agg_col_config(
    agent_input_prepended_agent_name_config
)


In [25]:
# Same per-match steps as the loop in MSPipeline.transform_cleansed_to_application,
# run here cell by cell against the notebook's `pipeline` instance.
for match_id in match_ids:
    match = matches[match_id]
    # Resolve every dotted path of the raw config for this match.
    config = pipeline.get_key(payload, match, yaml_conf)
    # First merge pass: agent-name-prefixed config, last argument False.
    pipeline.engine.sql_to_merge_specific_columns_to_standardized(
        agent_input_prepended_agent_name_config,
        match,
        config,
        False,
    )
    # Extend the resolved values with the match's *_ap / *_wl entries before
    # the second pass.
    config.update(
        {
            key: match.get(key)
            for key in match
            if key.endswith("_ap") or key.endswith("_wl")
        }
    )
    # Second merge pass: aggregated-column config, last argument True.
    pipeline.engine.sql_to_merge_specific_columns_to_standardized(
        agent_input_agg_col_config, match, config, True
    )


### payload

In [26]:
# partyType

In [27]:
# Spot-check the aggregated date-of-birth and residency columns of every match.
residency_keys = (
    "ap_all_residencies_aggregated",
    "wl_all_residencies_aggregated",
    "WL_COUNTRY",
)
for match in matches:
    print("++++++++")
    ap_dobs, wl_dobs = match["ap_all_dobs_aggregated"], match["wl_all_dobs_aggregated"]
    print(ap_dobs, wl_dobs)
    # print(match["ap_all_nationalities_aggregated"], match["wl_all_nationalities_aggregated"])
    print(*[match[residency_key] for residency_key in residency_keys])


++++++++
[['02/31/1900', '04/31/1910']] [['MAY 6, 1981']]
['UNITED STATES OF AMERICA'] ['US'] US
++++++++
[['02/31/1900', '04/31/1910']] [['MAY 6, 1981']]
['UNITED STATES OF AMERICA'] ['US'] US


In [28]:
import pickle
# Persist the fully processed payload as a pickle under tests/shared.
# The payload holds InputRecordField objects, so whoever loads this file needs
# that class importable; only load pickles from trusted sources.
with open("tests/shared/parsed_payload.pkl", "wb") as f:
    pickle.dump(payload, f)