# Entity Resolution using Deep Learning

In this notebook we're going to try to reproduce the results of some of the
systems described in the most complete survey [@barlaugsurvey2021] on entity
matching using deep learning techniques.
Additionally, where we can, we will compare the results obtained using those
techniques with logistic regression.

First things first: a few imports of modules and data.

In [1]:
import itertools
from dataclasses import dataclass

import numpy as np

from matchescu.data import EntityReferenceExtraction
from matchescu.matching.entity_reference import RawComparison
!test -f ~/requirements.txt && pip install -r ~/requirements.txt

In [2]:
import os
import polars as pl

from matchescu.data import EntityReferenceExtraction
from matchescu.matching.extraction import CsvDataSource, Traits, ListDataSource

Just like we've done previously, we'll be using only the Abt-Buy dataset.

In [3]:
# Language code of the pre-trained fastText vectors used further down.
LANG = "en"
# Root of the data directory, resolved relative to the current working directory.
DATADIR = os.path.abspath("../../data")
# The two Abt-Buy source tables and the ground-truth mapping between them.
LEFT_CSV_PATH, RIGHT_CSV_PATH, GROUND_TRUTH_PATH = (
    os.path.join(DATADIR, "abt-buy", file_name)
    for file_name in ("Abt.csv", "Buy.csv", "abt_buy_perfectMapping.csv")
)

Unlike with our previous approaches, we're not quite ready to construct a
feature matrix.
While we could definitely use the previous extraction traits to provide a
feature matrix containing the similarities of co-referent attributes, we want to
improve upon our work so far.
To do so, we're going to attempt to implement Deepmatcher from scratch as it is
described in the [@deepmatcher2018] paper.
Note that we'll be implementing the Hybrid Deepmatcher variant because it is
the one with the best reported results.

Firstly, we must embed attributes into sequences of word vectors.
To do this we're going to use NLTK to tokenize the input and fastText to create
a subword-aware (character n-gram based) embedding for every word.

In [4]:
import fasttext
import nltk

from fasttext.util import download_model


# Tokenizer data required by nltk.word_tokenize.
nltk.download("punkt")

# Fetch the pre-trained vectors for LANG, then load them from the working directory.
# NOTE(review): if_exists="ignore" presumably skips the download when the file is
# already present - confirm against the fasttext.util documentation.
download_model(LANG, if_exists="ignore")
FT_MODEL_FILE = f"cc.{LANG}.300.bin"
ft_model = fasttext.load_model(FT_MODEL_FILE)

[nltk_data] Downloading package punkt to /Users/cusi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Now that we have downloaded the `punkt` package locally, we can easily tokenize
the words we find in each attribute in our two data sources.
Let's write a function that does that.

In [5]:
# Model hyperparameters
# Smoothing constant `a` in the token weight a / (a + p(token)) used by
# compute_token_weights. With a = 1.0 and p(token) <= 1 every weight lies in
# [0.5, 1], so frequent and rare tokens are weighted almost equally.
# NOTE(review): SIF is usually run with a much smaller constant (around 1e-3) -
# confirm that 1.0 is intended.
SIF_ALPHA = 1.0
# Size of one fastText word vector, as reported by the loaded model.
INPUT_SIZE = ft_model.get_dimension()

In [6]:
from typing import Any, Generator, Iterable

import torch
from matchescu.typing import DataSource, Record


@dataclass
class TokenEmbedding:
    """A token paired with its embedding vector."""

    # The token text.
    token: str
    # The embedding vector of `token`.
    embedding: torch.Tensor


def tokenize_attribute_value(value: Any) -> Generator[str, None, None]:
    """Yield the lower-cased NLTK word tokens of an attribute value.

    ``None`` produces no tokens; any non-string value is converted with
    ``str()`` before it is tokenized.
    """
    if value is not None:
        text = value if isinstance(value, str) else str(value)
        for word in nltk.word_tokenize(text.lower()):
            yield word


def embed_str(value: str) -> torch.Tensor:
    """Return the fastText vector of ``value`` as a torch tensor."""
    word_vector = ft_model.get_word_vector(value)
    return torch.from_numpy(word_vector)


def token_embedding(value: str) -> TokenEmbedding:
    """Bundle a token with the embedding computed for it by ``embed_str``."""
    return TokenEmbedding(token=value, embedding=embed_str(value))


def tokenize_words(record: tuple) -> list[list[TokenEmbedding]]:
    """Tokenize and embed every attribute of ``record``.

    The result holds one inner list per attribute, in attribute order; each
    inner list holds the embedded tokens of that attribute.
    """
    embedded_attributes = []
    for attribute_value in record:
        embedded_tokens = []
        for word in tokenize_attribute_value(attribute_value):
            embedded_tokens.append(token_embedding(word))
        embedded_attributes.append(embedded_tokens)
    return embedded_attributes


def tokenize_data_source(ds: DataSource[Record]) -> Generator[list[list[TokenEmbedding]], None, None]:
    """Lazily yield the tokenized form of each record in ``ds``."""
    for record in ds:
        yield tokenize_words(record)


def build_normalized_unigram_frequencies(processed_data_sources: Iterable[list[list[TokenEmbedding]]]) -> dict[str, float]:
    """Compute the relative frequency of every token across tokenized records.

    Args:
        processed_data_sources: iterable of tokenized records; each record is
            a list of attributes and each attribute a list of objects exposing
            a ``token`` attribute.

    Returns:
        A mapping from token to ``count / total number of tokens``, in order
        of first appearance. Empty input yields an empty mapping.
    """
    # Local import so the function does not depend on imports made elsewhere.
    from collections import Counter

    token_counts = Counter(
        embedded_token.token
        for tokenized_record in processed_data_sources
        for attribute_tokens in tokenized_record
        for embedded_token in attribute_tokens
    )
    total_tokens = sum(token_counts.values())
    # With no tokens the comprehension body never runs, so no division by zero.
    return {
        token: count / total_tokens
        for token, count in token_counts.items()
    }


def compute_token_weights(token_embeddings: list[TokenEmbedding], frequency_table: dict[str, float], a: float = 1.0) -> torch.Tensor:
    """Compute one smoothing weight ``a / (a + p(token))`` per token.

    Args:
        token_embeddings: the tokens to weight; only ``.token`` is read.
        frequency_table: relative frequency ``p(token)`` of known tokens.
        a: smoothing constant; larger values flatten the weights.

    Returns:
        A 1-D tensor with one weight per input token, in input order.
    """
    return torch.tensor(
        [
            # Tokens absent from the table are treated as frequency 0, i.e.
            # weight 1, instead of raising KeyError.
            a / (a + frequency_table.get(te.token, 0.0))
            for te in token_embeddings
        ]
    )

In [7]:
from matchescu.matching.blocking import BlockEngine


# Column layout of the two CSV files: an integer id, string columns and a price.
abt_traits = Traits().int([0]).string([1, 2]).currency([3])
buy_traits = Traits().int([0]).string([1, 2, 3]).currency([4])

abt = CsvDataSource(name="abt", traits=abt_traits).read_csv(LEFT_CSV_PATH)
buy = CsvDataSource(name="buy", traits=buy_traits).read_csv(RIGHT_CSV_PATH)
print("pre-blocking", len(abt), len(buy))

# Each source is paired with a function that returns the first column of a record.
reference_extractions = [
    EntityReferenceExtraction(source, lambda x: x[0]) for source in (abt, buy)
]
# NOTE(review): presumably blocks on columns 1 and 2 with a Jaccard threshold of
# 0.6 and then keeps only cross-source candidates - confirm against BlockEngine.
block_engine = (
    BlockEngine(reference_extractions)
    .jaccard_blocks(1, 0.6)
    .jaccard_blocks(2, 0.6)
    .cross_sources_filter()
)

# From here on `abt` and `buy` refer to the data sources produced by blocking.
post_block_ds = block_engine.create_data_sources()
abt, buy = post_block_ds["abt"], post_block_ds["buy"]
print("post-blocking", len(abt), len(buy))

pre-blocking 1081 1092
post-blocking 368 312


In [8]:
# Materialise the tokenized records of both sources.
tokenized_abt = [*tokenize_data_source(abt)]
tokenized_buy = [*tokenize_data_source(buy)]
# The frequency table is computed over the tokens of both sources together.
token_frequency_table = build_normalized_unigram_frequencies(tokenized_abt + tokenized_buy)

In [9]:
class SifTensorTrait:
    """Callable that turns every attribute of a record into one vector.

    Each attribute is tokenized and embedded, and its token vectors are
    averaged using the weights returned by ``compute_token_weights``.
    Attributes that produce no tokens become a zero vector of ``input_dim``
    elements.
    """

    def __init__(self, frequency_table: dict[str, float], alpha: float = 1.0, input_dim: int = 300) -> None:
        self._ft = frequency_table  # token -> relative frequency
        self._a = alpha  # smoothing constant forwarded to compute_token_weights
        self._n = input_dim  # length of the zero vector used for empty attributes

    def _pool_attribute(self, attr_value) -> torch.Tensor:
        """Return the weighted average of the token vectors of one attribute."""
        tokens = [token_embedding(word) for word in tokenize_attribute_value(attr_value)]
        if not tokens:
            return torch.zeros(self._n)

        # Column vector of weights so it broadcasts across the embedding dimension.
        weights = compute_token_weights(tokens, self._ft, self._a).reshape(len(tokens), 1)
        vectors = torch.atleast_2d(torch.stack([t.embedding for t in tokens]))
        weighted_sum = (weights * vectors).sum(dim=0)
        return weighted_sum / weights.sum().float()

    def _extract_attribute_tensors(self, record: Record) -> Generator[torch.Tensor, None, None]:
        """Yield one pooled vector per attribute, in attribute order."""
        for attr_value in record:
            yield self._pool_attribute(attr_value)

    def __call__(self, record: Record) -> tuple[torch.Tensor, ...]:
        """Return the pooled vectors of all attributes of ``record`` as a tuple."""
        return tuple(self._extract_attribute_tensors(record))


In [10]:
# Trait that pools the token vectors of every attribute into one vector per attribute.
sif = SifTensorTrait(token_frequency_table, SIF_ALPHA, INPUT_SIZE)
traits = Traits().int([0])
# Rebuild both data sources from the blocked records, adding the SIF trait.
abt = ListDataSource(name="abt", traits=[*traits, sif]).extend(abt)
buy = ListDataSource(name="buy", traits=[*traits, sif]).extend(buy)
# Ground-truth rows as a set of tuples. GROUND_TRUTH_PATH already holds this
# exact path, so it is reused rather than rebuilt here.
# NOTE(review): ignore_errors=True may hide malformed rows - confirm this is wanted.
gt = set(pl.read_csv(GROUND_TRUTH_PATH, ignore_errors=True).iter_rows())

In [11]:
from matchescu.matching.ml.datasets import RecordLinkageDataSet
# Attribute pairs to compare: (feature name, index in an abt record, index in a buy record).
cmp_config = RawComparison()
for feature_name, left_index, right_index in (
    ("name", 1, 1),
    ("description", 2, 2),
    ("name_manufacturer", 1, 3),
    ("description_manufacturer", 2, 3),
    ("price", 3, 4),
):
    cmp_config = cmp_config.tensor_diff(feature_name, left_index, right_index)

ds = RecordLinkageDataSet(abt, buy, gt).vector_compare(cmp_config).cross_sources()
X, y = ds.feature_matrix.to_numpy(), ds.target_vector.to_numpy()
print(X.shape, y.shape)
# Number of comparisons labelled 1.
print(np.count_nonzero(y == 1))

(114816, 1500) (114816,)
291


Let's add a few more helper functions that enable us to train an ANN classifier.

In [12]:
import torch

from sklearn.model_selection import train_test_split
from torch.nn.modules import loss
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

def get_torch_device():
    """Return the device string to train on: an MPS device if usable, else ``"cpu"``.

    Availability is checked through ``torch.backends.mps.is_available()``, the
    documented entry point, because ``torch.mps.is_available`` is not present
    in every PyTorch release that supports MPS.
    """
    if torch.backends.mps.is_available():
        # Index of the last MPS device reported by torch.
        return f"mps:{torch.mps.device_count() - 1}"
    return "cpu"


def get_torch_generator():
    """Create a ``torch.Generator`` bound to the device chosen by ``get_torch_device``."""
    device = get_torch_device()
    return torch.Generator(device=device)


def create_dataloader(input_dataset: Dataset, batch_size: int = 32, shuffle: bool = True):
    """Wrap ``input_dataset`` in a ``DataLoader``.

    Args:
        input_dataset: the dataset to iterate over.
        batch_size: number of samples per batch.
        shuffle: whether the samples are reshuffled on every pass.
    """
    loader_options = {"batch_size": batch_size, "shuffle": shuffle}
    return DataLoader(input_dataset, **loader_options)

In [13]:
X = ds.feature_matrix.to_numpy()
y = ds.target_vector.to_numpy()
print("total comparisons:", len(X))

# Fixed seed so the three splits, and the metrics reported below, are the same
# on every run.
SPLIT_SEED = 42
# 60% for training; the remaining 40% is halved into cross-validation and test.
# Both splits are stratified so each part keeps the match/non-match ratio.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, train_size=0.6, stratify=y, random_state=SPLIT_SEED
)
X_cv, X_test, y_cv, y_test = train_test_split(
    X_holdout, y_holdout, train_size=0.5, stratify=y_holdout, random_state=SPLIT_SEED
)


def match_non_match_ratio(arr):
    """Return the ratio of matches to non-matches in ``arr``, rounded to 5 decimals.

    Non-zero entries count as matches. An empty input gives 0. An input made
    only of matches gives ``inf``, so it cannot be mistaken for an input with
    no matches at all.
    """
    n_matches = int(np.count_nonzero(arr))
    n_non_matches = len(arr) - n_matches
    if n_non_matches == 0:
        return float("inf") if n_matches else 0
    return round(n_matches / n_non_matches, 5)


# One summary line per split: shapes plus the match to non-match ratio.
for split_label, split_X, split_y in (
    ("training dataset size:", X_train, y_train),
    ("cross-validation dataset size:", X_cv, y_cv),
    ("test dataset size:", X_test, y_test),
):
    print(split_label, split_X.shape, split_y.shape, "; match to non-match ratio:", match_non_match_ratio(split_y))

total comparisons: 114816
training dataset size: (68889, 1500) (68889,) ; match to non-match ratio: 0.00255
cross-validation dataset size: (22963, 1500) (22963,) ; match to non-match ratio: 0.00253
test dataset size: (22964, 1500) (22964,) ; match to non-match ratio: 0.00253


In [14]:
class NpDataset(Dataset):
    """Torch dataset backed by a feature array and a label array.

    Item ``idx`` is the pair ``(features[idx], labels[idx])``, both converted
    to ``float32`` tensors.
    """

    def __init__(self, features: np.ndarray, labels: np.ndarray):
        self._features = features
        self._targets = labels

    def __len__(self):
        # The length is taken from the features alone.
        return len(self._features)

    def __getitem__(self, idx):
        row, label = self._features[idx], self._targets[idx]
        return (
            torch.tensor(row, dtype=torch.float32),
            torch.tensor(label, dtype=torch.float32),
        )


# Wrap each split so it can be fed to a DataLoader.
train, cv, test = (
    NpDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_cv, y_cv), (X_test, y_test))
)

# NOTE(review): the loss is unweighted although matches are a tiny fraction of
# the comparisons - confirm that no class weighting is intended.
loss_function = loss.CrossEntropyLoss()

In [15]:
from matchescu.matching.ml import TorchEngine
from matchescu.matching.ml.modules import HighwayMatchClassifier

N_EPOCHS = 3
TORCH_DEV = get_torch_device()
# One INPUT_SIZE-wide slice per configured comparison.
CLASSIFIER_INPUT_SIZE = INPUT_SIZE * len(cmp_config)

print("comparison vector input size:", CLASSIFIER_INPUT_SIZE)
print("training on device:", TORCH_DEV)

matcher = HighwayMatchClassifier(CLASSIFIER_INPUT_SIZE)
highway_optimizer = Adam(params=matcher.parameters(), lr=1e-3)
highway_engine = TorchEngine(matcher, loss_function, highway_optimizer, TORCH_DEV)
# Kept as the last expression of the cell so its result is displayed.
highway_engine.train(create_dataloader(train), create_dataloader(cv), N_EPOCHS)

comparison vector input size: 1500
training on device: mps:0
Epoch 1/3, Train Loss: 0.3166, Val Loss: 0.3158
Epoch 2/3, Train Loss: 0.3158, Val Loss: 0.3158
Epoch 3/3, Train Loss: 0.3158, Val Loss: 0.3158


HighwayMatchClassifier(
  (highway-net): HighwayNetwork(
    (_scale_in): Linear(in_features=1500, out_features=512, bias=True)
    (_layers): ModuleList(
      (0-1): 2 x HighwayLayer(
        (_basic_processor): Linear(in_features=512, out_features=512, bias=True)
        (_transform_gate): Linear(in_features=512, out_features=512, bias=True)
      )
    )
    (_scale_out): Linear(in_features=512, out_features=2, bias=True)
  )
  (softmax): LogSoftmax(dim=-1)
)

In [16]:
# Evaluate the highway classifier on the held-out test split.
highway_test_loader = create_dataloader(test)
test_loss = highway_engine.evaluate(highway_test_loader, compute_stats=True)
print(f"Test Loss: {test_loss:.4f}")
highway_stats = highway_engine.stats
print("Precision: %(precision).4f, Recall: %(recall).4f, F1: %(f1).4f" % highway_stats)

Test Loss: 0.3158
Precision: 0.0000, Recall: 0.0000, F1: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from matchescu.matching.ml.modules import ResidualMatchClassifier

# Same training setup as the highway classifier, with a residual architecture.
matcher = ResidualMatchClassifier(CLASSIFIER_INPUT_SIZE)
residual_optimizer = Adam(params=matcher.parameters(), lr=1e-3)
residual_engine = TorchEngine(matcher, loss_function, residual_optimizer, TORCH_DEV)
# Kept as the last expression of the cell so its result is displayed.
residual_engine.train(create_dataloader(train), create_dataloader(cv), N_EPOCHS)

Epoch 1/3, Train Loss: 0.0314, Val Loss: 0.0173


In [None]:
# Evaluate the residual classifier on the held-out test split.
residual_test_loader = create_dataloader(test)
test_loss = residual_engine.evaluate(residual_test_loader, compute_stats=True)
print(f"Test Loss: {test_loss:.4f}")
residual_stats = residual_engine.stats
print("Precision: %(precision).4f, Recall: %(recall).4f, F1: %(f1).4f" % residual_stats)