# Match Entity Records

Demonstrates use of the Intelligence Toolkit library to detect groups of records that are likely to refer to the same entity across datasets.

See [readme](https://github.com/microsoft/intelligence-toolkit/blob/main/app/workflows/match_entity_records/README.md) for more details.

In [1]:
import os
import sys

sys.path.append("..")
import polars as pl
from toolkit.AI.openai_configuration import OpenAIConfiguration
from toolkit.match_entity_records import (
    MatchEntityRecords,
    RecordsModel,
    AttributeToMatch,
)

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Create the workflow object
mer = MatchEntityRecords()

# Configure the OpenAI model used by the workflow.
# The API key is read from the environment so it never appears in the notebook;
# a KeyError is raised here if OPENAI_API_KEY is not set.
ai_configuration = OpenAIConfiguration(
    {
        "api_type": "OpenAI",
        "api_key": os.environ["OPENAI_API_KEY"],
        "model": "gpt-4o",
    }
)
mer.set_ai_configuration(ai_configuration)

# Load the two example CSV files into polars DataFrames.
data_1_file = "../example_outputs/match_entity_records/company_grievances/company_grievances_input_data_1.csv"
data_2_file = "../example_outputs/match_entity_records/company_grievances/company_grievances_input_data_2.csv"
data_1_df = pl.read_csv(data_1_file)
data_2_df = pl.read_csv(data_2_file)
print("Loaded data")

Loaded data


In [3]:
# Describe each dataset for the workflow: its short name, the DataFrame itself,
# the id and entity-name columns, and the remaining columns that the
# attribute configuration in the next cell refers to.
data_1_info = RecordsModel(
    dataframe_name="D1",
    dataframe=data_1_df,
    id_column="employer_id",
    name_column="employer_name",
    columns=[
        "sector",
        "address",
        "city",
        "country",
        "email",
        "phone",
        "owner",
    ],
)
data_2_info = RecordsModel(
    dataframe_name="D2",
    dataframe=data_2_df,
    id_column="company_id",
    name_column="company_name",
    columns=["industry_sector", "street_address", "city_address", "country_address", "email_address", "phone_number", "company_owner"],
)

# Register the datasets with the workflow (D1 first, then D2).
mer.add_df_to_model(data_1_info)
mer.add_df_to_model(data_2_info)

# Report how many datasets and records the workflow now holds.
print(f"Data model has **{len(mer.model_dfs)}** datasets with **{mer.total_records()}** total records.")

Data model has **2** datasets with **4000** total records.


In [4]:
# Configure text embedding

attributes = []
attributes.append(AttributeToMatch({"columns": ["address::D1", "street_address::D2"]}))
attributes.append(AttributeToMatch({"columns": ["city::D1", "city_address::D2"]}))
attributes.append(AttributeToMatch({"columns": ["country::D1", "country_address::D2"]}))
attributes.append(AttributeToMatch({"columns": ["sector::D1", "industry_sector::D2"]}))
attributes.append(AttributeToMatch({"columns": ["owner::D1", "company_owner::D2"]}))
attributes.append(AttributeToMatch({"columns": ["phone::D1", "phone_number::D2"]}))
attributes.append(AttributeToMatch({"columns": ["email::D1", "email_address::D2"]}))

# build model with datasets and columns
mer.build_model_df(attributes)

# embed sentences in model dataset
await mer.embed_sentences()

100%|██████████| 600/600 [00:09<00:00, 62.12it/s]
100%|██████████| 600/600 [00:09<00:00, 64.68it/s]
100%|██████████| 600/600 [00:09<00:00, 64.73it/s] 
100%|██████████| 595/595 [00:11<00:00, 53.76it/s] 
100%|██████████| 597/597 [00:09<00:00, 61.76it/s]
100%|██████████| 598/598 [00:08<00:00, 71.52it/s] 
100%|██████████| 395/395 [00:06<00:00, 60.21it/s]


Got 15 existing texts
Got 3985 new texts


In [15]:
# Similarity thresholds passed to the group detection step.
# NOTE(review): exact semantics of each threshold are defined by
# MatchEntityRecords.detect_record_groups — confirm before tuning.
embedding_threshold = 0.02
jaccard_threshold = 0.75

# Detect groups of matching records using both thresholds.
detected_model = mer.detect_record_groups(embedding_threshold, jaccard_threshold)

# Last expression of the cell: shown as the cell output.
f"Identified **{len(detected_model['Group ID'].unique())}** record groups."

'Identified **163** record groups.'

In [16]:
# Preview the first five rows of the detected groups (5 is the polars default, made explicit).
detected_model.head(5)

Group ID,Dataset,Entity ID,Entity name,address,city,company_owner,country,email,industry_sector,phone,Group size,Name similarity
i64,str,str,str,str,str,str,str,str,str,str,i64,f64
157,"""D1""","""3""","""Adventure Gear…","""555 Adventure …","""Outdoor City""","""Explorer Jane""","""ExploreLand""","""info@adventure…","""Retail""","""6655443322""",2,0.75
157,"""D1""","""4""","""Adventure Gear…","""555 Adventure …","""Outdoor City""","""Explorer Jane""","""ExploreLand""","""contact@advent…","""Retail""","""6655443322""",2,0.75
158,"""D1""","""330""","""Casey Builders…","""814 Builder St…","""Constructopoli…","""Jordan Casey""","""Buildland""","""contact@caseyb…","""Construction""","""5566778899""",2,0.75
158,"""D1""","""331""","""Casey Builders…","""820 Builder Av…","""Constructopoli…","""Jamie Casey""","""Buildland""","""info@caseybuil…","""Construction""","""5566778897""",2,0.75
159,"""D1""","""1006""","""GreenPower Sol…","""456 Eco Rd""","""GreenCity""","""Eco Light""","""EcoWorld""","""contact@greenp…","""Renewable Ener…","""9988773344""",2,0.75


In [17]:
# Generates AI report on selected data and prints it as plain text.
# NOTE(review): judging by the printed output, the report looks like CSV-style rows
# (an index, a numeric score, and a quoted explanation per group) — confirm
# against the return value of MatchEntityRecords.evaluate_groups.
report = mer.evaluate_groups()
print(report)

0,2,"Adventure Awaits and Adventure Gear have different addresses, cities, owners, countries, emails, and industry sectors, indicating they are unrelated."
1,8,"Adventure Gear, Adventure Gear Co., and Adventure Gear Company have similar names, addresses, cities, owners, countries, emails, and industry sectors, indicating they are likely related."
2,8,"Adventure Gear Co. and Adventure Gear Company have similar names, addresses, cities, owners, countries, emails, and industry sectors, indicating they are likely related."
3,2,"Adventure Outdoors and Adventure Seekers have different addresses, cities, owners, countries, emails, and industry sectors, indicating they are unrelated."
4,7,"Adventure Seekers and Adventure Seekers Travel have similar names, emails, and industry sectors, but different addresses, cities, owners, and countries, indicating they might be related."
5,2,"Adventure Tours and Adventure Tours Co. have different addresses, cities, owners, countries, emails, and industry se