# Using Pre-trained Transformers for Matching

Start out by declaring a few constants.

In [None]:
import os

import polars as pl
from transformers import AutoTokenizer

# Root directories, resolved against the current working directory.
DATADIR = os.path.abspath("../../data")
MODELDIR = os.path.abspath("../../models")

# Name of the pre-trained transformer used for the tokenizer and the matcher.
BERT_MODEL_NAME = "roberta-base"

# All three Abt-Buy files live in the same dataset directory.
ABT_BUY_DIR = os.path.join(DATADIR, "abt-buy")
LEFT_CSV_PATH = os.path.join(ABT_BUY_DIR, "Abt.csv")
RIGHT_CSV_PATH = os.path.join(ABT_BUY_DIR, "Buy.csv")
GROUND_TRUTH_PATH = os.path.join(ABT_BUY_DIR, "abt_buy_perfectMapping.csv")

Next, extract entity references and the ground truth from an existing CSV dataset.
The entity references are stored in an "ID" table.

In [7]:
from functools import partial
from matchescu.extraction import RecordIdAdapter, RecordExtraction, single_record, Traits
from matchescu.data_sources import CsvDataSource
from matchescu.reference_store.id_table import InMemoryIdTable
from matchescu.typing import EntityReferenceIdentifier

# Column traits for the Abt CSV: integer id, two text columns and a currency price.
abt_traits = list(
    Traits()
    .int(["id"])
    .string(["name", "description"])
    .currency(["price"])
)
abt = CsvDataSource(LEFT_CSV_PATH, traits=abt_traits).read()

# The Buy CSV has the same columns plus a "manufacturer" text column.
buy_traits = list(
    Traits()
    .int(["id"])
    .string(["name", "description", "manufacturer"])
    .currency(["price"])
)
buy = CsvDataSource(RIGHT_CSV_PATH, traits=buy_traits).read()

# Ground truth: one (Abt identifier, Buy identifier) pair per row of the
# perfect-mapping CSV. Each identifier couples the row's id with the name of
# the data source it belongs to, mirroring how `_id` builds identifiers.
# The path comes from GROUND_TRUTH_PATH so the file location is declared once,
# next to the other dataset paths.
gt = {
    (
        EntityReferenceIdentifier(id_abt, abt.name),
        EntityReferenceIdentifier(id_buy, buy.name),
    )
    # Each row is unpacked as exactly two values: (Abt id, Buy id).
    for id_abt, id_buy in pl.read_csv(GROUND_TRUTH_PATH, ignore_errors=True).iter_rows()
}

def _id(record, source):
    """Build the identifier of an entity reference.

    :param record: mapping-like record; its ``"id"`` item becomes the label.
    :param source: name of the data source the record was read from.
    :return: an ``EntityReferenceIdentifier`` made of the record id and source.
    """
    record_label = record["id"]
    return EntityReferenceIdentifier(record_label, source)


def load_data_source(data_source: CsvDataSource) -> None:
    """Extract entity references from ``data_source`` into the ID table.

    Every extracted reference is stored in the module-level ``id_table``,
    which must exist before this function is called.

    :param data_source: data source to read; its ``name`` is used as the
        source part of every identifier.
    """
    # Bind the source name so the adapter only needs the record itself.
    identify = partial(_id, source=data_source.name)
    adapter = RecordIdAdapter(identify)
    extraction = RecordExtraction(data_source, adapter, single_record)
    for reference in extraction():
        id_table.put(reference)


# Start from an empty table so re-running the cell does not accumulate references.
id_table = InMemoryIdTable()
for source in (abt, buy):
    load_data_source(source)

# Size of the naive comparison space: every Abt record paired with every Buy record.
original_comparison_space_size = len(abt) * len(buy)
print(
    f"total entity references: {len(id_table)}, original_comparison_space_size: {original_comparison_space_size}"
)

total entity references: 2173, original_comparison_space_size: 1180452


Next up, we create the comparison space.
A __binary__ comparison space is a list of pairs of entity reference identifiers.
Entity references paired in this way are deemed more likely to match than pairs left out of the comparison space.
The comparison space is generated through blocking and filtering.

In [4]:
from matchescu.comparison_filtering import is_cross_source_comparison
from matchescu.blocking import TfIdfBlocker
from matchescu.csg import BinaryComparisonSpaceGenerator, BinaryComparisonSpaceEvaluator

# NOTE(review): 0.23 is the second argument of TfIdfBlocker, presumably a
# similarity threshold; confirm against the blocker's documentation.
tfidf_blocker = TfIdfBlocker(id_table, 0.23)

# Block first, then filter the candidate pairs with is_cross_source_comparison.
csg = BinaryComparisonSpaceGenerator().add_blocker(tfidf_blocker).add_filter(
    is_cross_source_comparison
)
comparison_space = csg()

# Evaluate the generated space against the ground truth and the naive space size.
eval_cs = BinaryComparisonSpaceEvaluator(gt, original_comparison_space_size)
metrics = eval_cs(comparison_space)
print(metrics)

BlockingMetrics(pair_completeness=0.5569735642661805, pair_quality=0.05854172654977484, reduction_ratio=0.9911584715007472)


Next, we load a pre-trained matcher. This requires a model that was trained beforehand.
We're using the [Ditto classifier](https://github.com/megagonlabs/ditto/tree/master/ditto_light).
To train Ditto using a BERT model, see the `matchescu.matching.ml.ditto.train` module.

In [5]:
from matchescu.matching.ml.ditto import DittoMatcher

# Tokenizer matching the transformer checkpoint named by BERT_MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

# Columns passed to the matcher for the left (Abt) and right (Buy) records.
matcher = DittoMatcher(
    tokenizer,
    model_dir=MODELDIR,
    left_cols=("name", "description", "price"),
    right_cols=("name", "description", "manufacturer", "price"),
)
matcher.load_pretrained(BERT_MODEL_NAME)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


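Finally, run the matcher on every pair in the comparison space and compute some metrics.
Note that the metrics are computed over the comparison space only, so true matches that were discarded during blocking do not count against recall.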

In [6]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Ground-truth labels restricted to the comparison space: 1 when the pair is a
# known match, 0 otherwise.
csgt = [1 if pair in gt else 0 for pair in comparison_space]

# Predicted labels: look both references up in the ID table and ask the matcher.
pred = []
for left, right in comparison_space:
    left_ref = id_table.get(left)
    right_ref = id_table.get(right)
    pred.append(int(matcher(left_ref, right_ref)))

precision = precision_score(csgt, pred)
recall = recall_score(csgt, pred)
f1 = f1_score(csgt, pred)
print("precision: %.2f, recall: %.2f, F1: %.2f" % (precision, recall, f1))


precision: 0.98, recall: 0.74, F1: 0.84
