## Generate summary text descriptions for each Senzing entity

load the dependencies

In [1]:
import csv
import json
import pathlib
import re
import typing

from icecream import ic
import watermark

%load_ext watermark

In [2]:
%watermark
%watermark --iversions

Last updated: 2024-09-29T15:53:38.374449+02:00

Python implementation: CPython
Python version       : 3.12.5
IPython version      : 8.27.0

Compiler    : Clang 16.0.0 (clang-1600.0.26.3)
OS          : Darwin
Release     : 23.5.0
Machine     : arm64
Processor   : arm
CPU cores   : 10
Architecture: 64bit

json     : 2.0.9
csv      : 1.0
re       : 2.2.1
watermark: 2.5.0



load the Senzing export JSON

In [3]:
# relative path to the Senzing entity export (ICIJ report) that this notebook loads
icij_path: pathlib.Path = pathlib.Path("data/ICIJ-entity-report-2024-06-21_12-04-57-std.json")

In [4]:
# Map each resolved entity ID to a flat `{ feature type: description }` dict.
ents: dict = {}

with icij_path.open(encoding = "utf-8") as fp:
    # the export is parsed one JSON document per line
    for line in fp:
        line = line.strip()

        if not line:
            # tolerate blank lines (e.g., a stray empty line at the end
            # of the file) instead of letting `json.loads("")` raise
            continue

        dat = json.loads(line)
        ent: dict = dat["RESOLVED_ENTITY"]

        # NOTE(review): annotated as `str`, but the value is used as-is from
        # the JSON -- confirm the actual type of ENTITY_ID in the export
        ent_id: str = ent["ENTITY_ID"]

        # keep only the first description listed for each feature type;
        # a feature type with an empty list is skipped rather than
        # raising `IndexError`
        features: dict = {
            key: feature[0]["FEAT_DESC"]
            for key, feature in ent["FEATURES"].items()
            if feature
        }

        ents[ent_id] = features

enumerate the features available on entities

In [5]:
# union of the feature-type keys found across all loaded entities
feats: set = set().union(*(
    feature_map.keys()
    for feature_map in ents.values()
))

feats

{'ADDRESS',
 'COUNTRY_OF_ASSOCIATION',
 'DOB',
 'DUNS_NUMBER',
 'GROUP_ASSOCIATION',
 'NAME',
 'PHONE',
 'RECORD_TYPE',
 'REL_ANCHOR',
 'REL_POINTER',
 'WEBSITE'}

load the country codes used within the ICIJ Offshore Leaks dataset

In [6]:
# Lookup table built from the TSV file: first column -> second column
# (country code -> full country name, as consumed by `get_country()`).
COUNTRIES: dict = {}

cc_file = "data/senzing/country.tsv"

# `newline = ""` hands line-ending handling to the csv module, as its
# documentation requires for any file object passed to a reader
with open(cc_file, "r", encoding = "utf-8", newline = "") as fp:
    tsv_reader = csv.reader(fp, delimiter = "\t")
    next(tsv_reader, None)  # skip the header row

    # rows with fewer than two columns (e.g., blank lines, which the
    # reader yields as `[]`) are skipped instead of raising `IndexError`
    COUNTRIES = {
        row[0]: row[1]
        for row in tsv_reader
        if len(row) >= 2
    }


def get_country (
    code: typing.Optional[ str ],
    ) -> typing.Optional[ str ]:
    """
Look up the full country name for a country code.

The code is stripped of surrounding whitespace before the lookup in
`COUNTRIES`. Returns `None` when `code` is `None` or when the stripped
code has no entry in the table.
    """
    return None if code is None else COUNTRIES.get(code.strip())

prepare to filter out anonymized names on bearer shares, e.g., "The Bearer"

In [7]:
# Regular expressions for placeholder names; each one is applied by
# `filter_bearer()` to a lower-cased, whitespace-stripped name.
PAT_LIST: typing.List[ str ] = [
    r"^\-?(to\s+)?([the]+\s+)?bearer\.?\s?(\d+)?(\w)?$",
    r"^.*bearer.*shares?$",
    r"^the\s+bearer\s+\([\d\,]+\)$",
    r"^[ae]l\s+portador$",
    r"^the\s?bearer$",
    r"^bearer\s?warrant$",
    r"^bearer\s?shareholder$",
    r"^the\,\s+bearer$",
    r"^bearer\s+\(reedeem\s+shares\)$",
    r"^the\s+bearer\s+\(lost\)$",
    r"^bearer\s+\-\s+[\w]$",
    r"^bearer\s+\"\w\"$",
    r"^bearer\s+[\d\-]+$",
    r"^bearer\s+no\.\s+\d+$",
    r"^the\s+bearer\s+at\s+[\d\,]+$",
    r"^nan$",
    r"^[\?]+$",
]


def filter_bearer (
    name: str,
    ) -> bool:
    """
Decide whether a name should be kept, i.e., is not a placeholder.

These names are used to hide the identity of a company shareholder.

Returns `False` when the name matches any pattern in `PAT_LIST`,
and `True` otherwise.
    """
    # `str()` tolerates non-string values: a float NaN becomes "nan",
    # which `PAT_LIST` matches. The patterns are anchored with `^...$`,
    # so surrounding whitespace is stripped here rather than relying on
    # every caller to do so.
    name = str(name).strip().lower()

    return not any(
        re.search(pat, name) is not None
        for pat in PAT_LIST
    )

generate a summary description for each entity in the Senzing export

In [9]:
# For each record type, the features to append to the entity name, in
# order, each with the phrase that introduces it. Note that the PERSON
# country phrase has no leading comma, unlike the ORGANIZATION one.
SUMMARY_PHRASES: dict = {
    "ORGANIZATION": [
        ("ADDRESS", ", located at "),
        ("DUNS_NUMBER", ", DUNS "),
        ("PHONE", ", phone "),
        ("COUNTRY_OF_ASSOCIATION", ", in "),
        ("WEBSITE", ", website "),
    ],
    "PERSON": [
        ("DOB", ", born "),
        ("PHONE", ", phone "),
        ("ADDRESS", ", located at "),
        ("GROUP_ASSOCIATION", ", associated with "),
        ("COUNTRY_OF_ASSOCIATION", " in "),
    ],
}

# entity ID -> one-line text description
summaries: dict = {}

for ent_id, ent_feat in ents.items():
    # entities without a name get no summary
    if "NAME" not in ent_feat:
        continue

    text: str = ent_feat.get("NAME")

    # drop placeholder names such as "The Bearer"
    if not filter_bearer(text.strip()):
        continue

    kind: str = ent_feat.get("RECORD_TYPE")

    # entities without a record type get no summary
    if not kind:
        continue

    if kind not in SUMMARY_PHRASES:
        print(f"New entity type: {kind}")
        continue

    for feat_key, lead_in in SUMMARY_PHRASES[kind]:
        if feat_key not in ent_feat:
            continue

        if feat_key == "COUNTRY_OF_ASSOCIATION":
            # country codes are translated to full names; a code with
            # no entry in the lookup table adds nothing to the text
            country: typing.Optional[ str ] = get_country(ent_feat.get(feat_key))

            if country is not None:
                text += lead_in + country
        else:
            text += lead_in + ent_feat.get(feat_key)

    summaries[ent_id] = text

In [10]:
# output file: one tab-separated row per summarized entity
summ_path: pathlib.Path = pathlib.Path("data/senzing/summaries.tsv")

# `newline = ""` is required by the csv module for files handed to a
# writer; without it, platforms that translate newlines would rewrite the
# "\n" line terminator and any newline embedded in a quoted field
with summ_path.open("w", encoding = "utf-8", newline = "") as fp:
    writer = csv.writer(fp, delimiter = "\t", lineterminator = "\n")
    writer.writerow([ "sz_ent_id", "summary" ])  # header row

    # each `(ent_id, summary)` pair becomes one row
    writer.writerows(summaries.items())