## Initial experoments with names, name breaking and the current OS name matching

Goals: Get hands dirty on the problem and gain an initial understanding.

In [248]:
from dotenv import load_dotenv

load_dotenv()

True

### Create names with faker

In [73]:
from faker import Faker

In [279]:
fake = Faker("ar_AA")

fake.name()

'الآنسة اعتماد بنو مهدي'

In [74]:
import random
from typing import List

def randomized_locale_name(gen_funcs: List) -> str:
    return random.choice(gen_funcs)()

def name_gen_factory(locale: str = "ru_RU"):
    fake = Faker(locale)

    def gen_func():
        return f"{fake.first_name()} {fake.last_name()}"

    return gen_func

In [75]:
generate_german_name = name_gen_factory("de_DE")
generate_russian_name = name_gen_factory("ru_RU")
generate_arabic_name = name_gen_factory("ar_AA")

generators = [
    generate_german_name,
    generate_russian_name,
    generate_arabic_name,
]

In [77]:
# create a list of true and false matches
names = [randomized_locale_name(generators) for _ in range(1000)]

positive_pairs = [(n, n, 1.0) for n in names]
negative_pairs = [(n, random.choice(names), 0.0) for n in names]

In [78]:
for _ in range(10):
    print(randomized_locale_name(generators))

مرعي ميرفاب
Добромысл Макаров
Никита Морозова
عاقل عامر بن صعصعة
Софон Щербаков
رجائي رصاص
مهيب الشحوح
جليل عبد القيس
Антип Дорофеева
تميم المهرة


### Get the current OS matching algorithm from nomenklatura

Code from tests: https://github.com/opensanctions/nomenklatura/blob/main/tests/matching/test_logic.py

In [63]:
from nomenklatura.matching import LogicV1

In [64]:
from banal import ensure_list, hash_data
from followthemoney import model
from followthemoney.proxy import EntityProxy


def e(schema: str, **kwargs) -> EntityProxy:
    props = {}
    for key, value in kwargs.items():
        if value is not None:
            props[key] = ensure_list(value)
    data = {"schema": schema, "properties": props, "id": hash_data(props)}
    return EntityProxy.from_dict(model, data)

In [65]:
def matching_v1(left_name: str, right_name: str) -> float:
    a = e("Person", name=pair[0])
    b = e("Person", name=pair[1])
    return LogicV1.compare(a, b).score

In [66]:
# match all positive pairs and look for issues
for pair in positive_pairs:
    score = matching_v1(pair[0], pair[1])
    if score != pair[2]:
        print(pair)

In [67]:
# match all negative pairs and look for issues
for pair in negative_pairs:
    score = matching_v1(pair[0], pair[1])
    if score != pair[2]:
        print(pair, score)

('Hans-Karl Eberth', 'Hans-Georg Mude', 0.0) 0.3
('Мирон Доронина', 'Мирон Никифоров', 0.0) 0.45
('Eleonore Flantz', 'Элеонора Рогова', 0.0) 0.45
('Милий Блохин', 'Максим Блохина', 0.0) 0.45
('Илья Лапина', 'Илья Осипов', 0.0) 0.45
('Rita Drewes', 'Helge Drewes', 0.0) 0.45
('Гурий Уварова', 'Гурий Жукова', 0.0) 0.45
('Викентий Горшков', 'Викентий Сазонова', 0.0) 0.45
('Ираида Самсонова', 'Ираида Богданов', 0.0) 0.45
('Иванна Рожкова', 'Маргарита Рожкова', 0.0) 0.45
('Панфил Кондратьева', 'Ксения Кондратьева', 0.0) 0.45
('Delia Werner', 'Wulf Werner', 0.0) 0.45
('Евстафий Вишняков', 'Соломон Вишняков', 0.0) 0.45
('Игнатий Быков', 'Регина Быков', 0.0) 0.45
('Аскольд Федорова', 'Аскольд Шестакова', 0.0) 0.45
('Conrad Kobelt', 'Conrad Harloff', 0.0) 0.45
('Николай Панов', 'Ладислав Панова', 0.0) 0.45
('Ullrich Scheel', 'Gabi Scholl', 0.0) 0.45
('Freddy Reinhardt', 'Reinhardt Rust', 0.0) 0.45
('Heinz-Jürgen Ebert', 'Hannchen Heinz', 0.0) 0.3
('Giesela Schönland', 'Tatiana Schönland', 0.0) 0

## Distort names, try to break the matching

In [232]:
def random_distortion(name: str, distortion_functions: List) -> str:
    return random.choice(distortion_functions)(name)

def remove_whitespace(name: str) -> str:
    print("remove_whitespace")
    return name.replace(" ", "")

def remove_random_char(name: str) -> str:
    print("remove_random_char")
    return ''.join([char for i, char in enumerate(name) if i != random.randint(0, len(name)-1)])

def change_whitespace_order(name: str) -> str:
    print("change_whitespace_order")
    return " ".join(name.split(" ")[::-1])


distortion_functions = [
    remove_whitespace,
    remove_random_char,
    change_whitespace_order
]

In [247]:
name = "Stefan Hoffmann"
random_distortion(name, distortion_functions)

remove_whitespace


'StefanHoffmann'

In [254]:
## russian to german and back
import os
import requests

def translate_text(text_to_translate, target_language, api_key):
    # DeepL API endpoint
    api_url = "https://api-free.deepl.com/v2/translate"

    # Prepare the request parameters
    params = {
        'text': text_to_translate,
        'target_lang': target_language,
        'auth_key': api_key,
    }

    try:
        # Make the request
        response = requests.post(api_url, data=params)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Parse the JSON response
            translation_data = response.json()

            # Extract and return the translated text
            translated_text = translation_data['translations'][0]['text']
            return translated_text
        else:
            print(f"Error: {response.status_code}")
            print(response.text)
            return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Example usage
api_key = os.environ.get('DEEPL_API_KEY')

In [273]:
orig_name = "Евстафий Вишняков"
translated_name = translate_text(text_to_translate=orig_name, target_language="DE", api_key=api_key)

In [274]:
translated_name

'Evstafiy Vishnyakov'

In [275]:
backtranslated_name = translate_text(text_to_translate=translated_name, target_language="RU", api_key=api_key)

In [276]:
assert backtranslated_name == orig_name

In [277]:
backtranslated_name

'Евстафий Вишняков'

In [278]:
matching_v1(translated_name, backtranslated_name)

0.0

In [283]:
### Try sentence transformers
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")

# Two lists of sentences
sentences1 = [
    'Rita Drewes', 
    'Илья Лапина', 
    'Ullrich Scheel',
]

sentences2 = [
    'Helge Drewes',
    'Илья Осипов',
    'Gabi Scholl',
]

# Compute embedding for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Compute cosine-similarities
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Output the pairs with their score
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(
        sentences1[i], sentences2[i], cosine_scores[i][i]
    ))

Rita Drewes 		 Helge Drewes 		 Score: 0.4902
Илья Лапина 		 Илья Осипов 		 Score: 0.7966
Ullrich Scheel 		 Gabi Scholl 		 Score: 0.6045
