In [1]:
import yaml
import re
from pprint import pprint as pp

from business_layer.business_layer_ns import BusinessLayerNS

%load_ext autoreload
%autoreload 2

In [12]:
# The order of the steps matters.
# NOTE(review): presumably the steps are tried top to bottom and a step applies
# when any one of its condition dicts is fully satisfied — confirm against
# BusinessLayerNS.solve_hit.
POLICY_STEPS = [
    # Potential True Positive.
    {
        "decision": "PTP",
        "conditions": [
            {
                "name_name": ["MATCH", "STRONG_MATCH", "EXACT_MATCH"],
                "geo_location": ["COUNTRY_MATCH", "CITY_MATCH", "STATE_MATCH"],
            },
            {"sanctions_location": "TRUE"},
        ],
    },
    # False Positive.
    {
        "decision": "FP",
        "conditions": [
            {
                "name_name": ["HQ_NO_MATCH", "NO_MATCH", "WEAK_MATCH", "MATCH"],
                "geo_location": ["COUNTRY_NO_MATCH", "STATE_NO_MATCH", "CITY_NO_MATCH"],
            },
            {
                "name_name": ["HQ_NO_MATCH", "NO_MATCH", "WEAK_MATCH"],
                "geo_location": ["NO_DATA"],
            },
            {"sanctions_location": "FALSE", "entity_type_wl_type": "OTHER"},
        ],
    },
]

In [13]:
# Read the business-layer configuration from its YAML file (parsed with yaml.safe_load).
with open('business_layer/config_ns_bill_com.yaml', 'r') as cfg_file:
    cfg = yaml.safe_load(cfg_file)

In [14]:
# Build the business layer from the configuration loaded above.
biz_layer = BusinessLayerNS(cfg)

## Example

In [20]:
# NS example - mimicking the aggregated application layer from the ETL.
# Each record carries the alerted party ("ap_") and watchlist ("wl_") names and
# locations plus the watchlist entity type.
alert_data = [
    # Individual: slightly different spelling of the name, same location on both sides.
    {
        "ap_all_names_aggregated": ["Bill Gats"], "wl_all_names_aggregated": ["Bill Gates"],
        "wl_type": "INDIVIDUAL",
        "ap_all_locations_aggregated": ["Alabama"], "wl_all_locations_aggregated": ["Alabama"]
    },
    # Organization: alerted name has an extra legal suffix, same location on both sides.
    {
        "ap_all_names_aggregated": ["Microsoft Ltd"], "wl_all_names_aggregated": ["Microsoft"],
        "wl_type": "ORGANIZATION",
        "ap_all_locations_aggregated": ["Alabama"], "wl_all_locations_aggregated": ["Alabama"]
    },
    # Organization: different names, different cities, same country.
    {
        "ap_all_names_aggregated": ["Microsoft"], "wl_all_names_aggregated": ["Facebook"],
        "wl_type": "ORGANIZATION",
        "ap_all_locations_aggregated": ["Redmont, USA"], "wl_all_locations_aggregated": ["Menlo Park, USA"]
    },
    # Watchlist type OTHER: different names and different countries.
    {
        "ap_all_names_aggregated": ["Mirkosoft"], "wl_all_names_aggregated": ["Fejsbuk"],
        "wl_type": "OTHER",
        "ap_all_locations_aggregated": ["Pjongjang, North Korea"], 
        "wl_all_locations_aggregated": ["Antofagasta, Chile"],
    },
]

In [22]:
# Solve each example hit with the policy; every result is followed by a blank line.
for data in alert_data:
    print(biz_layer.solve_hit(data, POLICY_STEPS), end="\n\n")

SolvedHit(feature_vector={'geo_location': 'CITY_MATCH', 'entity_type_wl_type': 'INDIVIDUAL', 'name_name': 'MATCH', 'sanctions_location': 'FALSE'}, decision='PTP', comment="S8 recommended action: Potential True Positive\nAlerted Party's name () matches Watchlist Party's name ()\nAlerted Party's location (Alabama) matches Watchlist Party's location (Alabama)")

SolvedHit(feature_vector={'geo_location': 'CITY_MATCH', 'entity_type_wl_type': 'ORGANIZATION', 'name_name': 'MATCH', 'sanctions_location': 'FALSE'}, decision='PTP', comment="S8 recommended action: Potential True Positive\nAlerted Party's name (Microsoft Ltd) matches Watchlist Party's name (Microsoft)\nAlerted Party's location (Alabama) matches Watchlist Party's location (Alabama)")

SolvedHit(feature_vector={'geo_location': 'COUNTRY_MATCH', 'entity_type_wl_type': 'ORGANIZATION', 'name_name': 'NO_MATCH', 'sanctions_location': 'FALSE'}, decision='MI', comment='S8 recommended action: Manual Investigation')

SolvedHit(feature_vector={

## Data from Bill.com

### Loading application data to one list

In [None]:
import json
import os

In [None]:
# Load the application data of all matches from a JSON file one directory above the notebook.
with open("../all_matches_application_data.json", "r") as file:
    all_matches_application_data = json.load(file)

### Decision cardinalities

In [None]:
from collections import Counter

In [None]:
# Count how often each value of the "decision" field occurs in the loaded matches.
Counter(match["decision"] for match in all_matches_application_data)

## Solving with KMB on the full dataset

In [None]:
# Show how many matches were loaded.
print("All matches length")
print(len(all_matches_application_data))

In [None]:
# Watchlist entity types accepted by validate_match.
VALID_TYPES = {"INDIVIDUAL", "ORGANIZATION"}


def validate_match(match):
    """Tell whether a match passes validation.

    Returns True when match["reason"] does not contain the text
    "Alert generated in error" and match["wl_type"] is one of VALID_TYPES;
    otherwise returns False.
    """
    if "Alert generated in error" in match["reason"]:
        return False
    return match["wl_type"] in VALID_TYPES

In [None]:
import time

In [None]:
# Solve every loaded match with the business layer and measure the elapsed time.
# time.perf_counter() is a monotonic clock, so the measurement of this
# long-running cell cannot be distorted by system clock adjustments.
time_0 = time.perf_counter()
solved_all = [
    biz_layer.solve_hit(data=match, policy_steps=POLICY_STEPS)
    for match in all_matches_application_data
]
run_time = time.perf_counter() - time_0

In [None]:
# Earlier note: "run time 1000: 42.4 sec" — presumably 1000 matches took 42.4 seconds; TODO confirm.
run_time

In [None]:
# NOTE(review): presumably converts a measured run time of 2858 seconds into minutes — confirm.
2858/60

In [None]:
# One row per match: the raw match fields, overlaid with the solved feature
# vector, plus the final decision. Matches rejected by validate_match are always
# reported as "MI", whatever the solver decided.
all_results = [
    {
        **hit,
        **solution.feature_vector,
        "kmb_decision": "MI" if not validate_match(hit) else solution.decision,
    }
    for hit, solution in zip(all_matches_application_data, solved_all)
]


In [None]:
# Persist the merged results as JSON one directory above the notebook.
with open("../output_all_results_name_agent_used.json", "w") as ftw:
    json.dump(all_results, ftw)

### PTP

In [None]:
# Count how often each final KMB decision occurs in the merged results.
Counter(result["kmb_decision"] for result in all_results)

In [None]:
# Rows decided as PTP although the original reason says the name does not match.
# Iterates all_results: the previously used name all_solutions is never defined
# in this notebook, so the cell failed with NameError on a fresh kernel.
name_does_not_match = [
    sol for sol in all_results
    if sol["reason"] == "Name does not match" and sol["kmb_decision"] == "PTP"
]

In [None]:
# Inspect the first such row (raises IndexError when the list is empty).
name_does_not_match[0]

In [None]:
# Most common organisation-name solutions among PTP decisions.
# Iterates all_results: all_solutions is never defined in this notebook.
# NOTE(review): "org_name_solution" is not among the feature_vector keys shown in
# the example output (geo_location, entity_type_wl_type, name_name,
# sanctions_location) — confirm the key exists in the result rows.
reason_counts_ptp_org = Counter(
    sol["org_name_solution"] for sol in all_results
    if sol["kmb_decision"] == "PTP"
)
reason_counts_ptp_org.most_common(10)

In [None]:
# Most common geo solutions among PTP decisions.
# Iterates all_results: all_solutions is never defined in this notebook.
# NOTE(review): "geo_solution" is not among the feature_vector keys shown in the
# example output (which has "geo_location") — confirm the key exists in the rows.
reason_counts_ptp_geo = Counter(
    sol["geo_solution"] for sol in all_results
    if sol["kmb_decision"] == "PTP"
)
reason_counts_ptp_geo.most_common(10)

### MI

In [None]:
# Most common original reasons among rows decided as MI.
# Iterates all_results: all_solutions is never defined in this notebook.
reason_counts_mi = Counter(
    sol["reason"] for sol in all_results if sol["kmb_decision"] == "MI"
)
reason_counts_mi.most_common(10)

In [None]:
# Most common original reasons among MI rows whose watchlist entity type is OTHER.
# Iterates all_results: all_solutions is never defined in this notebook.
# NOTE(review): validate_match reads the entity type from "wl_type"; confirm that
# the rows also carry a "wl_entity_type" key.
reason_counts_mi_OTHER = Counter(
    sol["reason"]
    for sol in all_results
    if sol["kmb_decision"] == "MI" and sol["wl_entity_type"] == "OTHER"
)
reason_counts_mi_OTHER.most_common(10)

In [None]:
# Most common organisation-name solutions among MI decisions.
# Iterates all_results: all_solutions is never defined in this notebook.
# NOTE(review): "org_name_solution" is not among the feature_vector keys shown in
# the example output — confirm the key exists in the result rows.
reason_counts_mi_org = Counter(
    sol["org_name_solution"] for sol in all_results
    if sol["kmb_decision"] == "MI"
)
reason_counts_mi_org.most_common(10)

In [None]:
# Most common geo solutions among MI decisions.
# Iterates all_results: all_solutions is never defined in this notebook.
# NOTE(review): "geo_solution" is not among the feature_vector keys shown in the
# example output (which has "geo_location") — confirm the key exists in the rows.
reason_counts_mi_geo = Counter(
    sol["geo_solution"] for sol in all_results
    if sol["kmb_decision"] == "MI"
)
reason_counts_mi_geo.most_common(10)

## Reasons comparison

In [None]:
def print_names(row):
    """Print one result row on a single line.

    The line shows the record id, the alerted-party and watchlist names and
    locations, the original reason, the KMB decision and the org / individual /
    geo solutions, each preceded by its label. Raises KeyError when ``row``
    lacks one of the keys.
    """
    labelled_keys = (
        ("record_id", "record_id"),
        ("ap: ", "ap_all_names_aggregated"),
        (" wl: ", "wl_all_names_aggregated"),
        ("ap_geo: ", "ap_all_locations_aggregated"),
        ("wl_geo: ", "wl_all_locations_aggregated"),
        (" reason: ", "reason"),
        ("  KMB decision: ", "kmb_decision"),
        (" org: ", "org_name_solution"),
        (" ind: ", "individual_solution"),
        (" geo: ", "geo_solution"),
    )
    pieces = []
    for label, key in labelled_keys:
        pieces.extend((label, row[key]))
    print(*pieces)

In [None]:
# Most common original reasons across all result rows.
# Iterates all_results: all_solutions is never defined in this notebook.
Counter(sol["reason"] for sol in all_results).most_common(15)

In [None]:
def analyze_reason(bill_reason: str, our_reasons, solutions=None):
    """Summarise and sample the result rows whose reason contains ``bill_reason``.

    Prints, for all rows whose "reason" contains ``bill_reason``
    (case-insensitive), the counts of the KMB decision and of the org,
    individual and geo solutions. The rows are then narrowed to those where
    every ``(agent, solution)`` pair holds, and a spread of them (first row,
    the rows at 20%..80% of the list, last row) is printed with print_names.

    Args:
        bill_reason: Text searched for inside each row's "reason".
        our_reasons: Iterable of ``(key, expected_value)`` pairs; a row is kept
            only when ``row[key] == expected_value`` for every pair.
        solutions: Rows to analyse. Defaults to the notebook-level
            ``all_results`` list (the name ``all_solutions`` used before is
            never defined in this notebook).

    Returns:
        None. Everything is printed.
    """
    if solutions is None:
        solutions = all_results

    print("REASON: ", bill_reason)
    wanted = bill_reason.lower()
    reason_rows = [row for row in solutions if wanted in row["reason"].lower()]
    print("KMB Decision: ", Counter(row["kmb_decision"] for row in reason_rows))
    print("Org: ", Counter(row["org_name_solution"] for row in reason_rows))
    print("Individual: ", Counter(row["individual_solution"] for row in reason_rows))
    print("Geo: ", Counter(row["geo_solution"] for row in reason_rows))
    print("\n")

    # Keep only the rows for which every requested agent gave the given solution.
    selected_rows = reason_rows
    for agent, solution in our_reasons:
        selected_rows = [row for row in selected_rows if row[agent] == solution]

    # Nothing to sample: indexing an empty list below would raise IndexError.
    if not selected_rows:
        print("No rows match the requested agent solutions.")
        return

    selected_len = len(selected_rows)
    fractions = (0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8)
    indexes = [0, *(int(selected_len * fraction) for fraction in fractions), selected_len - 1]
    # dict.fromkeys drops repeated indexes (short lists) while keeping their order,
    # so no row is printed more than once.
    for index in dict.fromkeys(indexes):
        print_names(selected_rows[index])
        print("\n")

In [None]:
# Reason mentions an address mismatch while the geo solution is NO_DATA.
analyze_reason("address does not match", [("geo_solution", "NO_DATA")])

In [None]:
# Reason mentions a country mismatch while the geo solution is COUNTRY_MATCH.
analyze_reason("country does not match", [("geo_solution", "COUNTRY_MATCH")])

In [None]:
# Look up the raw application data of the record whose "record_id" is "123".
country_match = [match for match in all_matches_application_data if match["record_id"] == "123"]

In [None]:
country_match

In [None]:
# Reason mentions a partial name match while the org-name solution is MATCH.
analyze_reason("only partial name match", [("org_name_solution", "MATCH")])

In [None]:
# Reason mentions a partial name match while the individual solution is MATCH.
analyze_reason("only partial name match", [("individual_solution", "MATCH")])

In [None]:
# Reason says name and country do not match, yet geo is COUNTRY_MATCH and org name is MATCH.
analyze_reason(
    "name and country do not match",
    [("geo_solution", "COUNTRY_MATCH"), ("org_name_solution", "MATCH")],
)

In [None]:
# Reason says name and country do not match, yet the org-name solution is MATCH.
analyze_reason("name and country do not match", [("org_name_solution", "MATCH")])

In [None]:
# Reason says name and country do not match, yet the individual solution is MATCH.
analyze_reason("name and country do not match", [("individual_solution", "MATCH")])

In [None]:
# Reason says name and country do not match, yet the geo solution is COUNTRY_MATCH.
analyze_reason("name and country do not match", [("geo_solution", "COUNTRY_MATCH")])

In [None]:
# Decision counts are hard-coded here.
# NOTE(review): presumably copied from an earlier run's kmb_decision counts — confirm.
MI, FP, PTP = 27_318 + 14_923, 39_336, 752
ALL = sum((MI, FP, PTP))

# Share of each decision in the total number of decisions.
print("FP: ", FP / ALL)
print("MI: ", MI / ALL)
print("PTP: ", PTP / ALL)