# Prepare dataset and attributes

In [None]:
import sys

sys.path.append("../")
import polars as pl

from toolkit.match_entity_records.prepare_model import (
    build_attribute_options,
    format_dataset,
)

# Read the two CSV files whose records will be matched against each other.
df1 = pl.read_csv("file1.csv")
df2 = pl.read_csv("file2.csv")

# Register each table under a dataset name. format_dataset receives the frame,
# a list of attribute column names and then one more column name -- presumably
# the column holding the entity name (confirm against format_dataset's
# signature). Column names are passed exactly as written here, including their
# leading/trailing spaces.
matching_dfs = {
    "file1": format_dataset(
        df1,
        [" JoinDate", " Salary  ", "EmployeeID"],
        " FullName",
    ),
    "file2": format_dataset(
        df2,
        [" Start_Date", " Budget  ", "ProjID"],
        " TeamLead",
    ),
}

# Kept as the last expression of the cell so the notebook displays it.
attr_options = build_attribute_options(matching_dfs)
attr_options

In [None]:
from toolkit.match_entity_records.prepare_model import build_attribute_list
from toolkit.match_entity_records.config import AttributeToMatch


# A single attribute to compare across the datasets. Each entry in "columns"
# appears to follow the form "<column>::<dataset>", where the dataset part is
# one of the keys used in matching_dfs ("file1" / "file2").
join_date_attribute = AttributeToMatch(
    {
        "label": " JoinDate",
        "columns": [" JoinDate::file1", " Start_Date::file2"],
    },
)
attributes = [join_date_attribute]

# Last expression of the cell, shown by the notebook.
atts_to_datasets = build_attribute_list(attributes)
atts_to_datasets

In [None]:
from toolkit.match_entity_records.detect import build_attributes_dataframe


# Build one frame from the formatted datasets and the attribute mapping; how
# the rows are combined is decided inside build_attributes_dataframe.
merged_df = build_attributes_dataframe(matching_dfs, atts_to_datasets)
merged_df

In [None]:
# "Unique ID" is "<Entity ID>::<Dataset>". The entity id is cast to a string
# first so it can be concatenated with the separator and the dataset name.
entity_id_as_text = pl.col("Entity ID").cast(pl.Utf8)
unique_id_expr = (entity_id_as_text + "::" + pl.col("Dataset")).alias("Unique ID")

merged_df = merged_df.with_columns(unique_id_expr)
merged_df

# Embed merged dataframe

In [None]:
import os
from toolkit.AI.openai_embedder import OpenAIEmbedder
from toolkit.AI.openai_configuration import OpenAIConfiguration
from toolkit.match_entity_records.detect import convert_to_sentences

ai_configuration = OpenAIConfiguration(
    {
        "api_type": "OpenAI",
        "api_key": os.environ["OPENAI_API_KEY"],
        "model": "gpt-4o-2024-08-06",
    }
)

text_embedder = OpenAIEmbedder(
    configuration=ai_configuration,
)

all_sentences = convert_to_sentences(merged_df)

if __name__ == "__main__":
    embedding_data = await text_embedder.embed_store_many(all_sentences)
    print(len(embedding_data), " embedding_data")

# Build scores and map

In [None]:
from toolkit.match_entity_records.detect import (
    build_near_map,
    build_nearest_neighbors,
    build_sentence_pair_scores,
)

# Pull the raw vector out of every embedding record.
embeddings = [record["vector"] for record in embedding_data]

# Neighbour search over the vectors; yields distances and the matching indices.
distances, indices = build_nearest_neighbors(embeddings)

# Threshold handed to build_near_map together with the neighbour data
# (presumably the maximum embedding distance for a "near" pair -- confirm
# against build_near_map).
sentence_pair_embedding_threshold = 0.05
near_map = build_near_map(
    distances, indices, all_sentences, sentence_pair_embedding_threshold
)

# Score the candidate pairs in near_map using the merged frame.
sentence_pair_scores = build_sentence_pair_scores(near_map, merged_df)

# Match dataset

In [None]:
from toolkit.match_entity_records.detect import build_matches, build_matches_dataset

# Threshold passed to build_matches along with the pair scores.
matching_sentence_pair_jaccard_threshold = 0.75
entity_to_group, matches, pair_to_match = build_matches(
    sentence_pair_scores,
    merged_df,
    matching_sentence_pair_jaccard_threshold,
)

# Each match row is the group id followed by the merged frame's columns.
match_columns = ["Group ID"] + merged_df.columns
unsorted_matches = pl.DataFrame(list(matches), schema=match_columns)
matches_df_final = unsorted_matches.sort(
    by=["Group ID", "Entity name", "Dataset"],
    descending=False,
)

# Post-process the sorted frame with the pair and group lookups.
matches_df_final = build_matches_dataset(
    matches_df_final,
    pair_to_match,
    entity_to_group,
)

# Summary: row count, number of distinct groups, then the frame itself.
print(matches_df_final.shape[0], "matches")
print(matches_df_final["Group ID"].n_unique(), "groups formed")
print(matches_df_final)

## AI Report

In [None]:
import os
from toolkit.AI.openai_configuration import OpenAIConfiguration
from toolkit.AI.client import OpenAIClient
from toolkit.match_entity_records.model import (
    prepare_for_ai_report,
)

# Upper bound on the number of report lines printed (the header row counts).
MAX_REPORT_LINES = 30

# Message batches for the chat model, one request per batch.
batch_messages = prepare_for_ai_report(matches_df_final)
print(batch_messages)

ai_configuration = OpenAIConfiguration(
    {
        "api_type": "OpenAI",
        # Index os.environ, as the embedding cell does, so a missing key fails
        # here with a KeyError instead of passing api_key=None to the client.
        "api_key": os.environ["OPENAI_API_KEY"],
        "model": "gpt-4o-2024-08-06",
    }
)

# Not read again in this cell; kept because other cells may rely on it.
unique_names = matches_df_final["Entity name"].unique()

# Fenced CSV header that opens the report text.
prefix = "```\nGroup ID,Relatedness,Explanation\n"

# The client depends only on the configuration, so build it once rather than
# once per batch.
ai_client = OpenAIClient(ai_configuration)
report_chunks = []
for messages in batch_messages:
    response = ai_client.generate_chat(messages, stream=False)
    # Skip replies that are empty or whitespace-only.
    if response.strip():
        report_chunks.append(response)

report_text = prefix + "".join(chunk + "\n" for chunk in report_chunks)

# Remove every "```\n" fence marker (the one in the prefix and any inside the
# replies), then trim surrounding whitespace.
result = report_text.replace("```\n", "").strip()

# Keep at most MAX_REPORT_LINES lines; re-joining is a no-op for shorter text.
lines = result.split("\n")[:MAX_REPORT_LINES]
result = "\n".join(lines)
print(result)