Eliminate non-matching pairs from cluster
pckhoi committed Jul 20, 2021
1 parent fe3a497 commit 476d787
Showing 3 changed files with 86 additions and 31 deletions.
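In brief: `_get_clusters_dict_within_thresholds` groups records transitively, so a cluster could previously contain records that never matched each other (if A matched B and B matched C, C was pulled in even when the A-C pair scored below the threshold). The new `_split_clusters` method in matchers.py below breaks each such cluster into groups in which every pair of records actually matched.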
55 changes: 49 additions & 6 deletions datamatch/matchers.py
@@ -1,9 +1,9 @@
-from datamatch.variators import Variator
+import operator
+import functools
 import itertools
 import math
 from bisect import bisect_left, bisect
-from operator import itemgetter
-from typing import Iterator, Type
+from typing import Any, Iterator, Type
 
 import pandas as pd
 import numpy as np
@@ -12,6 +12,7 @@
 from .indices import BaseIndex
 from .pairers import DeduplicatePairer, MatchPairer
 from .filters import BaseFilter
+from .variators import Variator
 
 
 class ContinueOuter(Exception):
@@ -142,12 +143,52 @@ def _score_all_pairs(self):
                 )
             )
             pairs.append((sim, idx_a, idx_b))
-        self._pairs = sorted(pairs, key=itemgetter(0))
+        self._pairs = sorted(pairs, key=operator.itemgetter(0))
         # in dedup mode we can group more than 2 records, therefore we're not dropping lesser matches
         if self._mode == MODE_MATCH:
             self._remove_lesser_matches()
         self._scores = [t[0] for t in self._pairs]
 
+    def _split_clusters(self, orig_cluster: set[tuple[float, Any, Any]]) -> list[tuple[frozenset, set]]:
+        paths: dict[Any, set] = {}
+        pairs: dict[frozenset, tuple[float, Any, Any]] = {}
+        nodes = set()
+        for sim, idx_a, idx_b in orig_cluster:
+            paths.setdefault(idx_a, set()).add(idx_b)
+            paths.setdefault(idx_b, set()).add(idx_a)
+            nodes.add(idx_a)
+            nodes.add(idx_b)
+            pairs[frozenset([idx_a, idx_b])] = (sim, idx_a, idx_b)
+        clusters: list[set] = []
+        clustered = set()
+        for node in nodes:
+            if node in clustered:
+                continue
+            cluster = set([node])
+            clustered.add(node)
+            queue = [node]
+            # walk the graph, admitting a node only if it is linked to every current member
+            while len(queue) > 0:
+                cur = queue.pop()
+                for neighbor in paths[cur]:
+                    if neighbor in clustered:
+                        continue
+                    if all([n in paths[neighbor] for n in cluster]):
+                        clustered.add(neighbor)
+                        cluster.add(neighbor)
+                        queue.append(neighbor)
+            clusters.append(cluster)
+        return [
+            (
+                frozenset(s),
+                set([
+                    pairs[frozenset(y)]
+                    for y in itertools.combinations(s, 2)
+                ])
+            )
+            for s in clusters if len(s) > 1
+        ]
+
     def _get_clusters_dict_within_thresholds(self, lower_bound=0.7, upper_bound=1) -> dict[frozenset, set]:
         pairs = self._pairs[
             bisect_left(self._scores, lower_bound):
@@ -172,9 +213,11 @@ def _get_clusters_dict_within_thresholds(self, lower_bound=0.7, upper_bound=1) -> dict[frozenset, set]:
             new_val.add((sim, idx_a, idx_b))
             clusters.__setitem__(frozenset(new_key), new_val)
 
-        return clusters
+        return dict(functools.reduce(operator.add, [
+            self._split_clusters(cluster) for cluster in clusters.values()
+        ]))
 
-    def get_index_clusters_within_thresholds(self, lower_bound=0.7, upper_bound=1) -> list[set]:
+    def get_index_clusters_within_thresholds(self, lower_bound=0.7, upper_bound=1) -> list[frozenset]:
         """Returns index clusters with similarity score within specified thresholds
         Args:
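To make the new behavior concrete, here is a condensed standalone sketch of the rule `_split_clusters` applies. The `split` helper below is illustrative only, not part of the library's API; the scores and row indices are copied from the updated test expectations further down.

def split(orig_cluster):
    # adjacency: which records each record matched with
    neighbors = {}
    for _sim, a, b in orig_cluster:
        neighbors.setdefault(a, set()).add(b)
        neighbors.setdefault(b, set()).add(a)
    groups, seen = [], set()
    for node in neighbors:
        if node in seen:
            continue
        group, queue = {node}, [node]
        seen.add(node)
        while queue:
            cur = queue.pop()
            for nb in neighbors[cur]:
                # admit a record only if it matched every record already in the group
                if nb not in seen and all(m in neighbors[nb] for m in group):
                    seen.add(nb)
                    group.add(nb)
                    queue.append(nb)
        groups.append(group)
    return [g for g in groups if len(g) > 1]

# rows 2, 3 and 9 all matched each other, so they stay together
print(split({(0.980748, 2, 3), (0.923472, 3, 9), (0.902589, 2, 9)}))  # -> [{2, 3, 9}]

# without the (2, 9) pair the trio is no longer fully connected and gets
# broken up (which pair survives depends on set iteration order)
print(split({(0.980748, 2, 3), (0.923472, 3, 9)}))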
60 changes: 36 additions & 24 deletions datamatch/test_matchers.py
@@ -88,8 +88,12 @@ def test_deduplicate(self):
             ['bowen', 'latoya'],
             ['rhea', 'cherri'],
             ['rhea', 'cherrie'],
-            ['b', 'freedie'],
+            ['be', 'freedie'],
             ['du', 'demeia'],
+            ['teneisha', 'green'],
+            ['tyler', 'green'],
+            ['te neisha', 'green'],
+            ['t', 'green'],
         ], columns=cols)
 
         matcher = ThresholdMatcher(NoopIndex(), {
@@ -98,36 +102,44 @@
         }, df)
 
         self.assertEqual(
-            matcher.get_index_clusters_within_thresholds(),
+            matcher.get_index_clusters_within_thresholds(0.83),
             [
-                frozenset({6, 7}), frozenset({4, 5}),
-                frozenset({2, 3, 9}), frozenset({0, 1, 8}),
+                frozenset({6, 7}),
+                frozenset({4, 5}),
+                frozenset({2, 3, 9}),
+                frozenset({10, 12, 13}),
+                frozenset({0, 8, 1}),
             ],
         )
 
         self.maxDiff = None
-        print(matcher.get_clusters_within_threshold())
         self.assertEqual(
-            matcher.get_clusters_within_threshold().to_string(),
+            matcher.get_clusters_within_threshold(0.83).to_string(),
             '\n'.join([
-                ' last first',
-                'cluster_idx pair_idx sim_score row_key ',
-                '0 0 0.990522 6 rhea cherri',
-                ' 7 rhea cherrie',
-                '1 0 0.980748 2 dupas demia',
-                ' 3 dupas demeia',
-                ' 1 0.923472 3 dupas demeia',
-                ' 9 du demeia',
-                ' 2 0.902589 2 dupas demia',
-                ' 9 du demeia',
-                '2 0 0.941913 4 brown latoya',
-                ' 5 bowen latoya',
-                '3 0 0.939581 0 beech freddie',
-                ' 1 beech freedie',
-                ' 1 0.888144 1 beech freedie',
-                ' 8 b freedie',
-                ' 2 0.819520 0 beech freddie',
-                ' 8 b freedie',
+                ' last first',
+                'cluster_idx pair_idx sim_score row_key ',
+                '0 0 0.990522 6 rhea cherri',
+                ' 7 rhea cherrie',
+                '1 0 0.985297 10 teneisha green',
+                ' 12 te neisha green',
+                ' 1 0.878609 10 teneisha green',
+                ' 13 t green',
+                ' 2 0.876863 12 te neisha green',
+                ' 13 t green',
+                '2 0 0.980748 2 dupas demia',
+                ' 3 dupas demeia',
+                ' 1 0.923472 3 dupas demeia',
+                ' 9 du demeia',
+                ' 2 0.902589 2 dupas demia',
+                ' 9 du demeia',
+                '3 0 0.941913 4 brown latoya',
+                ' 5 bowen latoya',
+                '4 0 0.939581 0 beech freddie',
+                ' 1 beech freedie',
+                ' 1 0.923472 1 beech freedie',
+                ' 8 be freedie',
+                ' 2 0.857679 0 beech freddie',
+                ' 8 be freedie',
             ]),
         )
 
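The updated expectations also show the split rule from the other direction: `frozenset({0, 8, 1})` survives as one group because all three pairwise scores printed for rows 0, 1 and 8 clear the 0.83 threshold, while row 11 ('tyler', 'green') appears in no cluster at all, presumably because its pairings fall below the threshold or fail the every-pair rule. A quick sanity check on the clique condition, with scores copied from the expected output above:

import itertools

# scores for rows 0, 1 and 8 as printed in the expected output
pair_scores = {
    frozenset({0, 1}): 0.939581,  # beech freddie / beech freedie
    frozenset({1, 8}): 0.923472,  # beech freedie / be freedie
    frozenset({0, 8}): 0.857679,  # beech freddie / be freedie
}
# all three combinations are present and clear the 0.83 threshold,
# so {0, 1, 8} is fully connected and survives the split intact
assert {frozenset(p) for p in itertools.combinations([0, 1, 8], 2)} == set(pair_scores)
assert all(score >= 0.83 for score in pair_scores.values())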
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = datamatch
-version = 0.1.7
+version = 0.1.8
 author = Khoi Pham
 author_email = pckhoi@gmail.com
 description = Data matching utilities