diff --git a/datamatch/matchers.py b/datamatch/matchers.py
index b604315..c047857 100644
--- a/datamatch/matchers.py
+++ b/datamatch/matchers.py
@@ -1,9 +1,9 @@
-from datamatch.variators import Variator
+import operator
+import functools
 import itertools
 import math
 from bisect import bisect_left, bisect
-from operator import itemgetter
-from typing import Iterator, Type
+from typing import Any, Iterator, Type
 
 import pandas as pd
 import numpy as np
@@ -12,6 +12,7 @@
 from .indices import BaseIndex
 from .pairers import DeduplicatePairer, MatchPairer
 from .filters import BaseFilter
+from .variators import Variator
 
 
 class ContinueOuter(Exception):
@@ -142,12 +143,66 @@ def _score_all_pairs(self):
                         )
                     )
                 pairs.append((sim, idx_a, idx_b))
-        self._pairs = sorted(pairs, key=itemgetter(0))
+        self._pairs = sorted(pairs, key=operator.itemgetter(0))
         # in dedup mode we can group more than 2 records therefore we're not dropping lesser matches
         if self._mode == MODE_MATCH:
             self._remove_lesser_matches()
         self._scores = [t[0] for t in self._pairs]
 
+    def _split_clusters(self, orig_cluster: set[tuple[float, Any, Any]]) -> list[tuple[frozenset, set]]:
+        """Splits a cluster into sub-clusters in which every two records are paired
+
+        A record joins a sub-cluster only when it shares a pair with every
+        record already in it. Sub-clusters are grown greedily, so which
+        sub-cluster a record lands in depends on set iteration order.
+        Records left on their own are discarded.
+
+        Args:
+            orig_cluster (set of tuple):
+                (sim, idx_a, idx_b) pairs that make up the original cluster
+
+        Returns:
+            list of (frozenset of row indices, set of pairs) tuples
+        """
+        paths: dict[Any, set] = {}
+        pairs: dict[frozenset, tuple[float, Any, Any]] = {}
+        nodes = set()
+        for sim, idx_a, idx_b in orig_cluster:
+            paths.setdefault(idx_a, set()).add(idx_b)
+            paths.setdefault(idx_b, set()).add(idx_a)
+            nodes.add(idx_a)
+            nodes.add(idx_b)
+            pairs[frozenset([idx_a, idx_b])] = (sim, idx_a, idx_b)
+        clusters: list[set] = []
+        clustered = set()
+        for node in nodes:
+            if node in clustered:
+                continue
+            cluster = {node}
+            clustered.add(node)
+            queue = [node]
+            # grow a clique: a neighbor joins only if paired with every current member
+            while queue:
+                cur = queue.pop()
+                for neighbor in paths[cur]:
+                    if neighbor in clustered:
+                        continue
+                    if cluster.issubset(paths[neighbor]):
+                        clustered.add(neighbor)
+                        cluster.add(neighbor)
+                        queue.append(neighbor)
+            clusters.append(cluster)
+        return [
+            (
+                frozenset(s),
+                {
+                    pairs[frozenset(y)]
+                    for y in itertools.combinations(s, 2)
+                },
+            )
+            for s in clusters if len(s) > 1
+        ]
+
     def _get_clusters_dict_within_thresholds(self, lower_bound=0.7, upper_bound=1) -> dict[frozenset, set]:
         pairs = self._pairs[
             bisect_left(self._scores, lower_bound):
@@ -172,9 +227,11 @@ def _get_clusters_dict_within_thresholds(self, lower_bound=0.7, upper_bound=1) -
                 new_val.add((sim, idx_a, idx_b))
             clusters.__setitem__(frozenset(new_key), new_val)
 
-        return clusters
+        return dict(functools.reduce(operator.add, [
+            self._split_clusters(cluster) for cluster in clusters.values()
+        ], []))
 
-    def get_index_clusters_within_thresholds(self, lower_bound=0.7, upper_bound=1) -> list[set]:
+    def get_index_clusters_within_thresholds(self, lower_bound=0.7, upper_bound=1) -> list[frozenset]:
         """Returns index clusters with similarity score within specified thresholds
 
         Args:
diff --git a/datamatch/test_matchers.py b/datamatch/test_matchers.py
index 93b7538..2ac5088 100644
--- a/datamatch/test_matchers.py
+++ b/datamatch/test_matchers.py
@@ -88,8 +88,12 @@ def test_deduplicate(self):
             ['bowen', 'latoya'],
             ['rhea', 'cherri'],
             ['rhea', 'cherrie'],
-            ['b', 'freedie'],
+            ['be', 'freedie'],
             ['du', 'demeia'],
+            ['teneisha', 'green'],
+            ['tyler', 'green'],
+            ['te neisha', 'green'],
+            ['t', 'green'],
         ], columns=cols)
 
         matcher = ThresholdMatcher(NoopIndex(), {
@@ -98,36 +102,44 @@ def test_deduplicate(self):
         }, df)
 
         self.assertEqual(
-            matcher.get_index_clusters_within_thresholds(),
+            matcher.get_index_clusters_within_thresholds(0.83),
             [
-                frozenset({6, 7}), frozenset({4, 5}),
-                frozenset({2, 3, 9}), frozenset({0, 1, 8}),
+                frozenset({6, 7}),
+                frozenset({4, 5}),
+                frozenset({2, 3, 9}),
+                frozenset({10, 12, 13}),
+                frozenset({0, 8, 1}),
             ],
         )
 
         self.maxDiff = None
-        print(matcher.get_clusters_within_threshold())
         self.assertEqual(
-            matcher.get_clusters_within_threshold().to_string(),
+            matcher.get_clusters_within_threshold(0.83).to_string(),
             '\n'.join([
-                ' last first',
-                'cluster_idx pair_idx sim_score row_key ',
-                '0 0 0.990522 6 rhea cherri',
-                ' 7 rhea cherrie',
-                '1 0 0.980748 2 dupas demia',
-                ' 3 dupas demeia',
-                ' 1 0.923472 3 dupas demeia',
-                ' 9 du demeia',
-                ' 2 0.902589 2 dupas demia',
-                ' 9 du demeia',
-                '2 0 0.941913 4 brown latoya',
-                ' 5 bowen latoya',
-                '3 0 0.939581 0 beech freddie',
-                ' 1 beech freedie',
-                ' 1 0.888144 1 beech freedie',
-                ' 8 b freedie',
-                ' 2 0.819520 0 beech freddie',
-                ' 8 b freedie',
+                ' last first',
+                'cluster_idx pair_idx sim_score row_key ',
+                '0 0 0.990522 6 rhea cherri',
+                ' 7 rhea cherrie',
+                '1 0 0.985297 10 teneisha green',
+                ' 12 te neisha green',
+                ' 1 0.878609 10 teneisha green',
+                ' 13 t green',
+                ' 2 0.876863 12 te neisha green',
+                ' 13 t green',
+                '2 0 0.980748 2 dupas demia',
+                ' 3 dupas demeia',
+                ' 1 0.923472 3 dupas demeia',
+                ' 9 du demeia',
+                ' 2 0.902589 2 dupas demia',
+                ' 9 du demeia',
+                '3 0 0.941913 4 brown latoya',
+                ' 5 bowen latoya',
+                '4 0 0.939581 0 beech freddie',
+                ' 1 beech freedie',
+                ' 1 0.923472 1 beech freedie',
+                ' 8 be freedie',
+                ' 2 0.857679 0 beech freddie',
+                ' 8 be freedie',
             ]),
         )
 
diff --git a/setup.cfg b/setup.cfg
index 46a24cd..580018d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = datamatch
-version = 0.1.7
+version = 0.1.8
 author = Khoi Pham
 author_email = pckhoi@gmail.com
 description = Data matching utilities