# Using Pre-trained Transformers for Matching

Start out by declaring a few constants.

In [1]:
import os

import polars as pl
from transformers import AutoTokenizer

DATADIR = os.path.abspath("../../data")
MODELDIR = os.path.abspath("../../models")

BERT_MODEL_NAME = "roberta-base"
LEFT_CSV_PATH = os.path.join(DATADIR, "abt-buy", "Abt.csv")
RIGHT_CSV_PATH = os.path.join(DATADIR, "abt-buy", "Buy.csv")
GROUND_TRUTH_PATH = os.path.join(DATADIR, "abt-buy", "abt_buy_perfectMapping.csv")

  from .autonotebook import tqdm as notebook_tqdm


Next, extract entity references and the ground truth from an existing CSV dataset.
The entity references are stored in an "ID" table.

In [2]:
from functools import partial
from matchescu.extraction import (
    RecordExtraction,
    single_record,
    Traits,
)
from matchescu.data_sources import CsvDataSource
from matchescu.reference_store.id_table import InMemoryIdTable
from matchescu.typing import EntityReferenceIdentifier

abt_traits = list(Traits().string(["name", "description"]).currency(["price"]))
abt = CsvDataSource(LEFT_CSV_PATH, traits=abt_traits).read()
buy_traits = list(
    Traits().string(["name", "description", "manufacturer"]).currency(["price"])
)
buy = CsvDataSource(RIGHT_CSV_PATH, traits=buy_traits).read()

gt = set(
    (
        EntityReferenceIdentifier(id_abt, abt.name),
        EntityReferenceIdentifier(id_buy, buy.name),
    )
    for id_abt, id_buy in pl.read_csv(
        os.path.join(DATADIR, "abt-buy", "abt_buy_perfectMapping.csv"),
        ignore_errors=True,
    ).iter_rows()
)


def _id(records, source):
    return EntityReferenceIdentifier(records[0]["id"], source)


def load_data_source(data_source: CsvDataSource) -> None:
    extract_references = RecordExtraction(
        data_source, partial(_id, source=data_source.name), single_record
    )
    for ref in extract_references():
        id_table.put(ref)


id_table = InMemoryIdTable()
load_data_source(abt)
load_data_source(buy)
original_comparison_space_size = len(abt) * len(buy)
print(
    f"total entity references: {len(id_table)}, original_comparison_space_size: {original_comparison_space_size}"
)

total entity references: 2173, original_comparison_space_size: 1180452


Next up, we create the comparison space.
A __binary__ comparison space is a list of pairs of entity reference identifiers.
The entity references identified in this way are deemed more suitable than others to match.
The comparison space is generated through blocking and filtering.

In [3]:
from matchescu.comparison_filtering import is_cross_source_comparison
from matchescu.blocking import TfIdfBlocker, SortedNeighborhoodBlocker, LSHBlocker
from matchescu.csg import BinaryComparisonSpaceGenerator, BinaryComparisonSpaceEvaluator

csg = (
    BinaryComparisonSpaceGenerator()
    .add_blocker(TfIdfBlocker(id_table, 0.25))
    .add_blocker(SortedNeighborhoodBlocker(id_table, 12))
    .add_blocker(LSHBlocker(id_table, 0.25))
    .add_filter(is_cross_source_comparison)
)
comparison_space = csg()
eval_cs = BinaryComparisonSpaceEvaluator(gt, original_comparison_space_size)
metrics = eval_cs(comparison_space)
print(metrics)
print("comparison space size:", len(comparison_space))

BlockingMetrics(pair_completeness=0.6061987237921604, pair_quality=0.03634872916097295, reduction_ratio=0.984501699349063)
comparison space size: 18295


Next, we need to load a pretrained matcher. This requires training a model.
We're using the [Ditto classifier](https://github.com/megagonlabs/ditto/tree/master/ditto_light).
To train Ditto using a BERT model, see the `matchescu.matching.ml.ditto.train` module.

In [4]:
from matchescu.matching.ml.ditto import DittoSimilarity

matcher = DittoSimilarity(
    AutoTokenizer.from_pretrained(BERT_MODEL_NAME),
    model_dir=MODELDIR,
    left_cols=("name", "description", "price"),
    right_cols=("name", "description", "manufacturer", "price"),
)
matcher.load_pretrained(BERT_MODEL_NAME)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


It's time to run the matcher. We need to compute and store the matcher's predictions separately in order to evaluate them.

In [5]:
from sklearn.metrics import precision_score, recall_score, f1_score

# comparison space ground truth
csgt = [int(pair in gt) for pair in comparison_space]
refs = list(map(tuple, map(id_table.get_all, comparison_space)))

# run matching algorithm on comparison space
match_scores = {(x, y): matcher(x, y) for x, y in refs}
pred = [int(v > matcher.match_threshold) for v in match_scores.values()]
print(f"ground truth size: {len(csgt)}, prediction size: {len(pred)}")

# evaluate matching performance
print(
    "precision: %.2f, recall: %.2f, F1: %.2f"
    % (
        precision_score(csgt, pred),
        recall_score(csgt, pred),
        f1_score(csgt, pred),
    )
)

ground truth size: 18295, prediction size: 18295
precision: 0.72, recall: 0.82, F1: 0.76


Finally, we construct the similarity graph.
Since not all similarity computations are symmetric (e.g neural networks with asymmetric activation functions like ReLU) => `matcher(a, b) != matcher(b, a)`.
That means that the similarity graph is a directed graph.

In [6]:
from matchescu.references import EntityReference
from functools import reduce
from itertools import starmap
from matchescu.similarity import SimilarityGraph
from pyresolvemetrics import precision, recall, f1


# at runtime, the evaluated matcher is used directly instead of this stub
class MatcherStub:
    def __init__(self, scores: dict, threshold: float):
        self.__threshold = threshold
        self.__match_scores = scores

    @property
    def non_match_threshold(self) -> float:
        return self.__threshold

    @property
    def match_threshold(self) -> float:
        return self.__threshold

    def __call__(self, a: EntityReference, b: EntityReference) -> float:
        return self.__match_scores[(a, b)]


# effectively make the match/non-match choice binary
simg = reduce(
    lambda x, pair: x.add(*pair),
    refs,
    SimilarityGraph(
        MatcherStub(match_scores, matcher.match_threshold),
        matcher.match_threshold,
        matcher.match_threshold
    ),
)
print(repr(simg))
# the scores should be the same as the previous cell scores
cs_true_matches = set(comparison_space) & set(gt)
sim_graph_matches = frozenset(starmap(lambda x, y: (x.id, y.id), simg.matches()))
print(
    "precision: %.2f, recall: %.2f, F1: %.2f"
    % (
        precision(cs_true_matches, sim_graph_matches),
        recall(cs_true_matches, sim_graph_matches),
        f1(cs_true_matches, sim_graph_matches),
    )
)

SimilarityGraph(nodes=2172, edges=755, match=755, non_match=17540, maybe=0)
precision: 0.72, recall: 0.82, F1: 0.76


In [17]:
import networkx as nx
from pyresolvemetrics import cluster_precision, cluster_recall, cluster_comparison_measure, rand_index, \
    adjusted_rand_index, twi, pair_precision, pair_recall, pair_comparison_measure
from matchescu.clustering import EquivalenceClassPartitioner, ConnectedComponents


unique_ids = list(map(lambda r: r.id, simg.nodes))
partitioner = EquivalenceClassPartitioner(unique_ids)
connected_components = ConnectedComponents(None)

clustering_methods = [
    (partitioner, partitioner(cs_true_matches), partitioner(sim_graph_matches)),
    (connected_components, frozenset(frozenset(ref_id for ref_id in cluster) for cluster in nx.connected_components(nx.Graph(cs_true_matches))), connected_components(simg)),
]
results = []
for clustering_method, gt_clusters, result_clusters in clustering_methods:
    results.append({
        "method": clustering_method.__class__.__name__,
        "Pair Precision": pair_precision(gt_clusters, result_clusters),
        "Pair Recall": pair_recall(gt_clusters, result_clusters),
        "Pair Comparison Measure": pair_comparison_measure(gt_clusters, result_clusters),
        "Cluster Precision": cluster_precision(gt_clusters, result_clusters),
        "Cluster Recall": cluster_recall(gt_clusters, result_clusters),
        "Cluster Comparison Measure": cluster_comparison_measure(gt_clusters, result_clusters),
        "Rand Index": rand_index(gt_clusters, result_clusters),
        "Adjusted Rand Index": adjusted_rand_index(gt_clusters, result_clusters),
        "Talburt-Wang Index": twi(gt_clusters, result_clusters),
    })

display(pl.DataFrame(results).transpose(include_header=True, header_name="method", column_names="method"))

method,EquivalenceClassPartitioner,ConnectedComponents
str,f64,f64
"""Pair Precision""",0.668347,0.616921
"""Pair Recall""",0.980769,0.776627
"""Pair Comparison Measure""",0.794964,0.687623
"""Cluster Precision""",0.951133,0.175606
"""Cluster Recall""",0.891175,0.730887
"""Cluster Comparison Measure""",0.920178,0.283175
"""Rand Index""",0.999866,0.999751
"""Adjusted Rand Index""",0.810487,0.882554
"""Talburt-Wang Index""",0.936961,6.082349
