In [1]:
import os
import glob

# Select the application config directory through the environment (path is relative to the working directory).
# NOTE(review): presumably etl_pipeline.config reads CONFIG_APP_DIR when it is first imported in a later
# cell, which would be why this is set before any project import — confirm.
os.environ["CONFIG_APP_DIR"] = "tests/test_custom/config_app/"

In [2]:
import sys
import pickle
# Move the working directory one level up before the project packages below are imported.
# NOTE: this is not idempotent — every re-run of the cell moves up one more level, so restart the
# kernel before running it again.
# NOTE(review): presumably the notebook lives one level below the repository root — confirm.
os.chdir("..")
from omegaconf import OmegaConf
from etl_pipeline.config import pipeline_config
from etl_pipeline.custom.ms.payload_loader import PayloadLoader

import json
from etl_pipeline.data_processor_engine.json_engine.json_engine import JsonProcessingEngine

In [3]:
from etl_pipeline.config import columns_namespace as cn

In [4]:
# Root directory of the ETL sample JSON files; the dataset-specific sub-directory name is appended
# to it when the sample files are globbed.
# NOTE(review): hard-coded absolute path — it only resolves on hosts where this location exists.
ETL_SAMPLES_PATH = '/v/region/na/appl/itlnc/s8/data/uat/silent8/name-screening/etl_samples/complete_info/'

In [5]:
# Dataset selection for the sample files.
# DATASET_NAME alternatives noted so far: 'WM', 'ISG_Daily', 'ISG_Weekly'
# DATASET_TYPE alternatives noted so far: 'Address', 'Party', 'Account'
DATASET_NAME = 'WM'
DATASET_TYPE = 'Address'
# "<name>_<type>", e.g. 'WM_Address'
DATASET = '_'.join((DATASET_NAME, DATASET_TYPE))

In [6]:
# Paths of every sample JSON for the selected dataset.
# os.path.join keeps the pattern valid whether or not ETL_SAMPLES_PATH ends with a slash, and
# sorting makes the processing order reproducible (glob returns matches in arbitrary order).
complete_info = sorted(glob.glob(os.path.join(ETL_SAMPLES_PATH, DATASET, '*.json')))

In [11]:
from etl_pipeline.config import columns_namespace as cn
from etl_pipeline.custom.ms.transformations import (
    create_agent_input_agg_col_config,
    prepend_agent_name_to_ap_or_wl_or_aliases_key,
)
from etl_pipeline.custom.ms.watchlist_extractor import WatchlistExtractor
from pipelines.ms.ms_pipeline import MSPipeline as WMPipeline


class MSPipeline(WMPipeline):
    """Notebook-local subclass of the imported MS pipeline that overrides both transform stages."""

    def transform_standardized_to_cleansed(self, payloads):
        """Run the standardized -> cleansed stage and return the resulting payloads.

        On entry ``payloads`` is indexed by key to reach the related parties; it is then
        rebound to the result of ``connect_input_record_with_match_record``, which is what
        gets iterated, enriched in place and returned.
        """
        related_parties = payloads[cn.SUPPLEMENTAL_INFO][cn.RELATED_PARTIES][cn.PARTIES]
        self.flatten_parties(related_parties)
        self.parse_input_records(payloads)

        payloads = self.connect_input_record_with_match_record(payloads)

        for payload in payloads:
            alerted_party = payload[cn.ALERTED_PARTY_FIELD]
            match_records = alerted_party[cn.MATCH_RECORDS]
            input_fields = alerted_party[cn.INPUT_RECORD_HIST][0]["INPUT_FIELD"]

            # Per-match enrichment: watchlist values, trigger reasons, beneficiary hits.
            for record in match_records:
                WatchlistExtractor().update_match_with_wl_values(record)
                trigger_reasons = self.engine.set_trigger_reasons(
                    record, self.pipeline_config.FUZZINESS_LEVEL
                )
                record[cn.TRIGGERED_BY] = trigger_reasons
                self.engine.set_beneficiary_hits(record)

            self.engine.connect_full_names(related_parties)

            self.engine.collect_party_values_from_parties(related_parties, payload)
            self.engine.collect_party_values_from_parties_from_fields(input_fields, payload)
            payload[cn.ALL_CONNECTED_PARTY_TYPES] = payload[cn.ALL_PARTY_TYPES]

            # Clean the concatenated address value against the collected party names.
            concat_value = self.engine.get_field_value_name(input_fields, cn.CONCAT_ADDRESS)
            name_sources = {
                column: payload[column]
                for column in (cn.ALL_PARTY_NAMES, cn.ALL_CONNECTED_PARTIES_NAMES)
            }
            cleaned_names = self.engine.get_clean_names_from_concat_name(
                concat_value, name_sources
            )
            payload.update({cn.CLEANED_NAMES: cleaned_names})

            residue = payload[cn.CLEANED_NAMES][cn.CONCAT_RESIDUE]
            payload.update({cn.CONCAT_RESIDUE: residue})

            # Record whether cleaning left the concatenated address value untouched.
            concat_residue = payload[cn.CONCAT_RESIDUE]
            concat_address = self.engine.get_field_value_name(input_fields, cn.CONCAT_ADDRESS)
            is_unchanged = concat_residue == concat_address
            payload.update({cn.CONCAT_ADDRESS_NO_CHANGES: is_unchanged})

            for record in match_records:
                record[cn.AP_TRIGGERS] = self.engine.set_triggered_tokens_discovery(
                    payload, record, input_fields
                )
        return payloads

    def transform_cleansed_to_application(self, payloads):
        """Run the cleansed -> application stage on every match record and return ``payloads``.

        For each payload the agent config is loaded for the payload's ``datasetName``;
        each match record is then merged, flattened and passed through
        ``remove_nulls_from_aggegated`` in place.
        """

        def flattened_subset(record, suffixes):
            # Flattened values of the entries whose key ends with ``suffixes`` (str or tuple of str).
            return {
                name: self.flatten(record.get(name))
                for name in record
                if name.endswith(suffixes)
            }

        ap_wl_suffixes = ("_ap", "_wl")

        for payload in payloads:
            alerted_party = payload[cn.ALERTED_PARTY_FIELD]
            match_records = alerted_party[cn.MATCH_RECORDS]
            dataset_name = alerted_party["headerInfo"]["datasetName"]
            agent_config, yaml_conf = self.load_agent_config(dataset_name)

            prepended_config = prepend_agent_name_to_ap_or_wl_or_aliases_key(agent_config)
            aggregated_config = create_agent_input_agg_col_config(prepended_config)

            for record in match_records:
                config = self.get_key(payload, record, yaml_conf)
                self.engine.sql_to_merge_specific_columns_to_standardized(
                    prepended_config, record, config, False
                )

                # Flatten the *_ap / *_wl values on the record, then copy the flattened
                # values (computed from the updated record) into the per-match config.
                record.update(flattened_subset(record, ap_wl_suffixes))
                config.update(flattened_subset(record, ap_wl_suffixes))

                self.engine.sql_to_merge_specific_columns_to_standardized(
                    aggregated_config, record, config, False
                )
                self.remove_nulls_from_aggegated(record)

                # Flatten the *_aggregated values, then run the null removal once more.
                record.update(flattened_subset(record, "_aggregated"))
                self.remove_nulls_from_aggegated(record)
        return payloads
wm_address_in_payload_format

### Testing flow

In [12]:
# from pipelines.ms.wm_party_pipeline import MSPipeline
# ^ Left disabled: while it stays commented out, MSPipeline below is the class defined in this notebook.

# Build the JSON processing engine and the pipeline under test, both from the shared pipeline_config.
engine = JsonProcessingEngine(pipeline_config)
pipeline = MSPipeline(engine, config=pipeline_config)

In [33]:
# Load the sample alert from disk.
with open('notebooks/sample/wm_party_in_payload_format.json', 'r') as file:
    payload = json.load(file)

# Rebuild the dict with its top-level keys in sorted order, run it through the project loader
# and keep only the 'alertPayload' part of the result.
payload_sorted = {key: payload[key] for key in sorted(payload)}
payload_json = PayloadLoader().load_payload_from_json(payload_sorted)['alertPayload']

# One id per match record: 0 .. n-1.
payload_json['match_ids'] = list(
    range(len(payload_json[cn.ALERTED_PARTY_FIELD][cn.MATCH_RECORDS]))
)


In [34]:
# Run both pipeline stages on the loaded sample.
# NOTE: `payload` is rebound here from the raw JSON dict to the result of the first stage.
# NOTE(review): the first stage appears to modify payload_json in place (it passes the payload's
# parties list to flatten_parties), so re-running this cell without reloading the sample may
# fail — confirm.
payload = pipeline.transform_standardized_to_cleansed(payload_json)
new_payloads = pipeline.transform_cleansed_to_application(payload)

In [35]:
import pandas as pd

In [36]:
# One DataFrame row per match record, across all resulting payloads.
match_records = [
    match
    for payload in new_payloads
    for match in payload['alert']['matchRecords']
]
out_payload = pd.DataFrame(match_records)

In [37]:
# Show only the columns whose name ends with "_aggregated".
aggregated_columns = [
    column for column in out_payload.columns if column.endswith("_aggregated")
]
out_payload[aggregated_columns]

Unnamed: 0,ap_all_dobs_aggregated,wl_all_dobs_aggregated,ap_all_documents_aggregated,wl_all_documents_aggregated,ap_all_employer_names_aggregated,wl_all_employer_names_aggregated,ap_all_names_aggregated,wl_all_names_aggregated,ap_all_party_types_aggregated,wl_all_party_types_aggregated,ap_all_pobs_aggregated,wl_all_pobs_aggregated,ap_all_residencies_aggregated,wl_all_residencies_aggregated,ap_all_santioned_countries_aggregated,wl_all_santioned_countries_aggregated
0,"[02/31/1900, 04/31/1910]","[MAY 6, 1981]","[154421273, 022368917]",[],[],[Joe Ding],"[Pladimir Vutin, Eva Pladimirova]",[Joe Ding],"[Individual, Individual]",[],[],"[US, United States of America, Joe Ding]","[United States, United States]","[US, United States of America]",[],"[US, United States of America, Joe Ding]"
1,"[02/31/1900, 04/31/1910]","[MAY 6, 1981]","[154421273, 022368917]",[],[],[Joe Dong],"[Pladimir Vutin, Eva Pladimirova]",[Joe Dong],"[Individual, Individual]",[],[],"[PL, Poland, Joe Dong]","[United States, United States]","[PL, Poland]",[],"[PL, Poland, Joe Dong]"


# Check all test JSONs — this cell must be customized first: copy the code from the class definition above into the loop below

In [None]:
from tqdm import tqdm

# Alias so that code pasted from the MSPipeline methods above (which refer to `self`) runs unchanged.
self = pipeline

for info in tqdm(complete_info):
    # print('\n' + os.path.split(info)[1])
    with open(info, 'r') as file:
        payload = json.load(file)

    # Rebuild the dict with sorted top-level keys and add one id per match record (0 .. n-1).
    payload_json = {key: payload[key] for key in sorted(payload)}
    payload_json['match_ids'] = list(
        range(len(payload_json[cn.ALERTED_PARTY_FIELD][cn.MATCH_RECORDS]))
    )
    payload = payload_json

    # --- transform standardized to cleansed ---
    parties = payload[cn.SUPPLEMENTAL_INFO][cn.RELATED_PARTIES][cn.PARTIES]

    # Replace every party entry by its "fields" value, in place.
    for num, party in enumerate(parties):
        parties[num] = party["fields"]

    new_payloads = self.connect_input_record_with_match_record(payload)

    for payload in new_payloads:
        matches = payload[cn.ALERTED_PARTY_FIELD][cn.MATCH_RECORDS]
        fields = payload[cn.ALERTED_PARTY_FIELD][cn.INPUT_RECORD_HIST][0]["INPUT_FIELD"]
        for match in matches:
            WatchlistExtractor().update_match_with_wl_values(match)
            match[cn.TRIGGERED_BY] = self.engine.set_trigger_reasons(
                match, self.pipeline_config.FUZZINESS_LEVEL
            )
            self.engine.set_beneficiary_hits(match)

        self.engine.connect_full_names(parties)

        # NOTE(review): the MSPipeline class above calls collect_party_values_from_parties and
        # collect_party_values_from_parties_from_fields here instead — confirm which is current.
        self.engine.collect_party_values(parties, payload)
        payload[cn.ALL_CONNECTED_PARTY_TYPES] = payload[cn.ALL_PARTY_TYPES]
        names_source_cols = [
            cn.ALL_PARTY_NAMES,
            cn.ALL_CONNECTED_PARTIES_NAMES,
        ]

        # NOTE(review): `.get(...)` returns None when the field is missing, so `.value` then raises
        # AttributeError. The class above reads this value via engine.get_field_value_name.
        payload.update(
            {
                cn.CLEANED_NAMES: self.engine.get_clean_names_from_concat_name(
                    fields.get(cn.CONCAT_ADDRESS, None).value,
                    {key: payload[key] for key in names_source_cols},
                )
            }
        )

        payload.update({cn.CONCAT_RESIDUE: payload[cn.CLEANED_NAMES][cn.CONCAT_RESIDUE]})

        concat_residue = payload[cn.CONCAT_RESIDUE]
        concat_address = fields.get(cn.CONCAT_ADDRESS, None).value

        # True when cleaning left the concatenated address value unchanged.
        payload.update({cn.CONCAT_ADDRESS_NO_CHANGES: concat_residue == concat_address})
        for match in matches:
            match[cn.AP_TRIGGERS] = self.engine.set_triggered_tokens_discovery(
                payload, match, fields
            )

    # --- transform cleansed to application ---
    for payload in new_payloads:
        matches = payload[cn.ALERTED_PARTY_FIELD][cn.MATCH_RECORDS]
        # NOTE(review): called without a dataset name here, whereas the class above passes
        # headerInfo/datasetName — confirm the intended signature.
        agent_config, yaml_conf = self.load_agent_config()
        agent_input_prepended_agent_name_config = prepend_agent_name_to_ap_or_wl_or_aliases_key(
            agent_config
        )

        agent_input_agg_col_config = create_agent_input_agg_col_config(
            agent_input_prepended_agent_name_config
        )

        for match in matches:
            config = self.get_key(payload, match, yaml_conf)
            self.engine.sql_to_merge_specific_columns_to_standardized(
                agent_input_prepended_agent_name_config,
                match,
                config,
                False,
            )
            # Copy the *_ap / *_wl values of the match into the per-match config (no flattening here).
            config.update(
                {
                    key: match.get(key)
                    for key in match
                    if key.endswith("_ap") or key.endswith("_wl")
                }
            )
            # NOTE(review): last argument is True here but False in the class above — confirm.
            self.engine.sql_to_merge_specific_columns_to_standardized(
                agent_input_agg_col_config, match, config, True
            )
