# Match Entity Records

Demonstrates use of the Intelligence Toolkit library to match records that refer to the same entity across multiple datasets.

See [readme](https://github.com/microsoft/intelligence-toolkit/blob/main/app/workflows/match_entity_records/README.md) for more details.

In [1]:
import sys

# Add the parent directory to the import path so the `toolkit` imports below
# resolve (presumably the notebook is run from its own directory — confirm).
sys.path.append("..")
import polars as pl
from toolkit.match_entity_records.api import (
    MatchEntityRecords,
)

from toolkit.match_entity_records.classes import RecordsModel, AttributeToMatch

# Must start at column 0: a leading indent on a top-level statement raises
# "IndentationError: unexpected indent" when the cell is executed.
from tqdm.autonotebook import tqdm, trange


In [2]:
# Set up the workflow object, attach its AI configuration, and load both input CSVs
import os
from toolkit.AI.openai_configuration import OpenAIConfiguration


mer = MatchEntityRecords()

# The API key is taken from the environment; a missing OPENAI_API_KEY raises KeyError here.
openai_settings = {
    "api_type": "OpenAI",
    "api_key": os.environ["OPENAI_API_KEY"],
    "model": "gpt-4o",
}
ai_configuration = OpenAIConfiguration(openai_settings)
mer.set_ai_configuration(ai_configuration)

# Locations of the two example input files, relative to the notebook
data_1_file = "../example_outputs/match_entity_records/company_grievances/company_grievances_input_data_1.csv"
data_2_file = "../example_outputs/match_entity_records/company_grievances/company_grievances_input_data_2.csv"

# Read each file into its own polars DataFrame
data_1_df = pl.read_csv(data_1_file)
data_2_df = pl.read_csv(data_2_file)

print("Loaded data")

Loaded data


In [3]:
# Describe each dataset for the workflow: its name, the id and entity-name
# columns, and the remaining columns to carry into the model.
d1_columns = ["sector", "address", "city", "country", "email", "phone", "owner"]
d2_columns = [
    "industry_sector",
    "street_address",
    "city_address",
    "country_address",
    "email_address",
    "phone_number",
    "company_owner",
]

data_1_info = RecordsModel(
    dataframe_name="D1",
    dataframe=data_1_df,
    id_column="employer_id",
    name_column="employer_name",
    columns=d1_columns,
)
data_2_info = RecordsModel(
    dataframe_name="D2",
    dataframe=data_2_df,
    id_column="company_id",
    name_column="company_name",
    columns=d2_columns,
)

# Register the datasets in the same order they were defined (D1, then D2)
for records_info in (data_1_info, data_2_info):
    mer.add_df_to_model(records_info)

dataset_count = len(mer.model_dfs)
print(
    f"Data model has **{dataset_count}** datasets with **{mer.total_records}** total records."
)

Data model has **2** datasets with **4000** total records.


In [4]:
# Configure text embedding

attributes = []
attributes.append(AttributeToMatch({"columns": ["address::D1", "street_address::D2"]}))
attributes.append(AttributeToMatch({"columns": ["city::D1", "city_address::D2"]}))
attributes.append(AttributeToMatch({"columns": ["country::D1", "country_address::D2"]}))
attributes.append(AttributeToMatch({"columns": ["sector::D1", "industry_sector::D2"]}))
attributes.append(AttributeToMatch({"columns": ["owner::D1", "company_owner::D2"]}))
attributes.append(AttributeToMatch({"columns": ["phone::D1", "phone_number::D2"]}))
attributes.append(AttributeToMatch({"columns": ["email::D1", "email_address::D2"]}))

# build model with datasets and columns
mer.build_model_df(attributes)

# embed sentences in model dataset
await mer.embed_sentences()

Got 4000 existing texts
Got 0 new texts


In [14]:
# Similarity thresholds handed to record-group detection.
# NOTE(review): how each threshold is interpreted is defined inside the toolkit —
# confirm against its documentation before tuning these values.
embedding_threshold = 0.05
jaccard_threshold = 0.75

detected_model = mer.detect_record_groups(embedding_threshold, jaccard_threshold)

# Count distinct group ids; the trailing expression is shown as the cell's output
group_ids = detected_model["Group ID"].unique()
f"Identified **{len(group_ids)}** record groups."

'Identified **664** record groups.'

In [15]:
# Preview the first rows of the detected record groups (rendered as the cell output)
detected_model.head()

Group ID,Dataset,Entity ID,Entity name,address,city,company_owner,country,email,industry_sector,phone,Group size,Name similarity
i64,str,str,str,str,str,str,str,str,str,str,i64,f64
696,"""D1""","""330""","""Casey Builders…","""814 Builder St…","""Constructopoli…","""Jordan Casey""","""Buildland""","""contact@caseyb…","""Construction""","""5566778899""",2,0.75
696,"""D1""","""331""","""Casey Builders…","""820 Builder Av…","""Constructopoli…","""Jamie Casey""","""Buildland""","""info@caseybuil…","""Construction""","""5566778897""",2,0.75
697,"""D1""","""436""","""Digital Dreams…","""1400 Cyber St""","""Tech City""","""Taylor Digital…","""Digitalia""","""info@digitaldr…","""Technology""","""1012345678""",2,0.75
697,"""D2""","""456""","""Digital Dreams…","""88 Cyber Stree…","""Tech City""","""Dream Weaver""","""Innovatia""","""support@digita…","""Technology""","""5566778899""",2,0.75
698,"""D1""","""518""","""EcoBuild Innov…","""1414 Eco St""","""Green City""","""Sam Right""","""Constructia""","""contact@ecobui…","""Construction""","""1122334455""",2,0.75


In [17]:
# Generates AI report on selected data
# evaluate_groups() is awaited at the top level of the cell and its result is
# printed as-is.
report = await mer.evaluate_groups()
print(report)

Group ID,Relatedness,Explanation
0,2,"The entities have similar names and are in the travel industry, but have different addresses, owners, and contact information, indicating they are likely unrelated."
1,8,"Adventure Gear, Adventure Gear Co., and Adventure Gear Company have similar names, addresses, and contact information, suggesting they are likely the same entity."
2,8,"Adventure Gear Co. and Adventure Gear Company have identical addresses and contact information, indicating they are likely the same entity."
3,2,"Adventure Outdoors and Adventure Seekers have similar names and are in the recreation industry, but have different addresses, owners, and contact information, indicating they are likely unrelated."
4,7,"Adventure Seekers and Adventure Seekers Travel have similar names and are in the travel industry, with similar contact information, suggesting they are likely related."
5,2,"Adventure Tours and Adventure Tours Co. have similar names and are in the travel industry, but have