Eliminate non-matching pairs from cluster
pckhoi committed Jul 20, 2021
1 parent fe3a497 commit 476d787
Showing 3 changed files with 86 additions and 31 deletions.
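In brief: `_get_clusters_dict_within_thresholds` groups records transitively, so a cluster could previously contain records that never matched each other (if A matched B and B matched C, C was pulled in even when the A-C pair scored below the threshold). The new `_split_clusters` method in matchers.py below breaks each such cluster into groups in which every pair of records actually matched.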
55 changes: 49 additions & 6 deletions datamatch/matchers.py
@@ -1,9 +1,9 @@
-from datamatch.variators import Variator
+import operator
+import functools
 import itertools
 import math
 from bisect import bisect_left, bisect
-from operator import itemgetter
-from typing import Iterator, Type
+from typing import Any, Iterator, Type
 
 import pandas as pd
 import numpy as np
@@ -12,6 +12,7 @@
 from .indices import BaseIndex
 from .pairers import DeduplicatePairer, MatchPairer
 from .filters import BaseFilter
+from .variators import Variator
 
 
 class ContinueOuter(Exception):
@@ -142,12 +143,52 @@ def _score_all_pairs(self):
                 )
             )
             pairs.append((sim, idx_a, idx_b))
-        self._pairs = sorted(pairs, key=itemgetter(0))
+        self._pairs = sorted(pairs, key=operator.itemgetter(0))
         # in dedup mode we can group more than 2 records, therefore we're not dropping lesser matches
         if self._mode == MODE_MATCH:
             self._remove_lesser_matches()
         self._scores = [t[0] for t in self._pairs]
 
+    def _split_clusters(self, orig_cluster: set[tuple[float, Any, Any]]) -> list[tuple[frozenset, set]]:
+        paths: dict[Any, set] = {}
+        pairs: dict[frozenset, tuple[float, Any, Any]] = {}
+        nodes = set()
+        for sim, idx_a, idx_b in orig_cluster:
+            paths.setdefault(idx_a, set()).add(idx_b)
+            paths.setdefault(idx_b, set()).add(idx_a)
+            nodes.add(idx_a)
+            nodes.add(idx_b)
+            pairs[frozenset([idx_a, idx_b])] = (sim, idx_a, idx_b)
+        clusters: list[set] = []
+        clustered = set()
+        for node in nodes:
+            if node in clustered:
+                continue
+            cluster = set([node])
+            clustered.add(node)
+            queue = [node]
+            # walk the graph, admitting a node only if it is linked to every current member
+            while len(queue) > 0:
+                cur = queue.pop()
+                for neighbor in paths[cur]:
+                    if neighbor in clustered:
+                        continue
+                    if all([n in paths[neighbor] for n in cluster]):
+                        clustered.add(neighbor)
+                        cluster.add(neighbor)
+                        queue.append(neighbor)
+            clusters.append(cluster)
+        return [
+            (
+                frozenset(s),
+                set([
+                    pairs[frozenset(y)]
+                    for y in itertools.combinations(s, 2)
+                ])
+            )
+            for s in clusters if len(s) > 1
+        ]
+
     def _get_clusters_dict_within_thresholds(self, lower_bound=0.7, upper_bound=1) -> dict[frozenset, set]:
         pairs = self._pairs[
             bisect_left(self._scores, lower_bound):
@@ -172,9 +213,11 @@ def _get_clusters_dict_within_thresholds(self, lower_bound=0.7, upper_bound=1) -> dict[frozenset, set]:
             new_val.add((sim, idx_a, idx_b))
             clusters.__setitem__(frozenset(new_key), new_val)
 
-        return clusters
+        return dict(functools.reduce(operator.add, [
+            self._split_clusters(cluster) for cluster in clusters.values()
+        ]))
 
-    def get_index_clusters_within_thresholds(self, lower_bound=0.7, upper_bound=1) -> list[set]:
+    def get_index_clusters_within_thresholds(self, lower_bound=0.7, upper_bound=1) -> list[frozenset]:
         """Returns index clusters with similarity score within specified thresholds
         Args:
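To make the new behavior concrete, here is a condensed standalone sketch of the rule `_split_clusters` applies. The `split` helper below is illustrative only, not part of the library's API; the scores and row indices are copied from the updated test expectations further down.

def split(orig_cluster):
    # adjacency: which records each record matched with
    neighbors = {}
    for _sim, a, b in orig_cluster:
        neighbors.setdefault(a, set()).add(b)
        neighbors.setdefault(b, set()).add(a)
    groups, seen = [], set()
    for node in neighbors:
        if node in seen:
            continue
        group, queue = {node}, [node]
        seen.add(node)
        while queue:
            cur = queue.pop()
            for nb in neighbors[cur]:
                # admit a record only if it matched every record already in the group
                if nb not in seen and all(m in neighbors[nb] for m in group):
                    seen.add(nb)
                    group.add(nb)
                    queue.append(nb)
        groups.append(group)
    return [g for g in groups if len(g) > 1]

# rows 2, 3 and 9 all matched each other, so they stay together
print(split({(0.980748, 2, 3), (0.923472, 3, 9), (0.902589, 2, 9)}))  # -> [{2, 3, 9}]

# without the (2, 9) pair the trio is no longer fully connected and gets
# broken up (which pair survives depends on set iteration order)
print(split({(0.980748, 2, 3), (0.923472, 3, 9)}))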
60 changes: 36 additions & 24 deletions datamatch/test_matchers.py
@@ -88,8 +88,12 @@ def test_deduplicate(self):
             ['bowen', 'latoya'],
             ['rhea', 'cherri'],
             ['rhea', 'cherrie'],
-            ['b', 'freedie'],
+            ['be', 'freedie'],
             ['du', 'demeia'],
+            ['teneisha', 'green'],
+            ['tyler', 'green'],
+            ['te neisha', 'green'],
+            ['t', 'green'],
         ], columns=cols)
 
         matcher = ThresholdMatcher(NoopIndex(), {
@@ -98,36 +102,44 @@
         }, df)
 
         self.assertEqual(
-            matcher.get_index_clusters_within_thresholds(),
+            matcher.get_index_clusters_within_thresholds(0.83),
             [
-                frozenset({6, 7}), frozenset({4, 5}),
-                frozenset({2, 3, 9}), frozenset({0, 1, 8}),
+                frozenset({6, 7}),
+                frozenset({4, 5}),
+                frozenset({2, 3, 9}),
+                frozenset({10, 12, 13}),
+                frozenset({0, 8, 1}),
             ],
         )
 
         self.maxDiff = None
-        print(matcher.get_clusters_within_threshold())
         self.assertEqual(
-            matcher.get_clusters_within_threshold().to_string(),
+            matcher.get_clusters_within_threshold(0.83).to_string(),
             '\n'.join([
-                ' last first',
-                'cluster_idx pair_idx sim_score row_key ',
-                '0 0 0.990522 6 rhea cherri',
-                ' 7 rhea cherrie',
-                '1 0 0.980748 2 dupas demia',
-                ' 3 dupas demeia',
-                ' 1 0.923472 3 dupas demeia',
-                ' 9 du demeia',
-                ' 2 0.902589 2 dupas demia',
-                ' 9 du demeia',
-                '2 0 0.941913 4 brown latoya',
-                ' 5 bowen latoya',
-                '3 0 0.939581 0 beech freddie',
-                ' 1 beech freedie',
-                ' 1 0.888144 1 beech freedie',
-                ' 8 b freedie',
-                ' 2 0.819520 0 beech freddie',
-                ' 8 b freedie',
+                ' last first',
+                'cluster_idx pair_idx sim_score row_key ',
+                '0 0 0.990522 6 rhea cherri',
+                ' 7 rhea cherrie',
+                '1 0 0.985297 10 teneisha green',
+                ' 12 te neisha green',
+                ' 1 0.878609 10 teneisha green',
+                ' 13 t green',
+                ' 2 0.876863 12 te neisha green',
+                ' 13 t green',
+                '2 0 0.980748 2 dupas demia',
+                ' 3 dupas demeia',
+                ' 1 0.923472 3 dupas demeia',
+                ' 9 du demeia',
+                ' 2 0.902589 2 dupas demia',
+                ' 9 du demeia',
+                '3 0 0.941913 4 brown latoya',
+                ' 5 bowen latoya',
+                '4 0 0.939581 0 beech freddie',
+                ' 1 beech freedie',
+                ' 1 0.923472 1 beech freedie',
+                ' 8 be freedie',
+                ' 2 0.857679 0 beech freddie',
+                ' 8 be freedie',
             ]),
         )
 
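The updated expectations also show the split rule from the other direction: `frozenset({0, 8, 1})` survives as one group because all three pairwise scores printed for rows 0, 1 and 8 clear the 0.83 threshold, while row 11 ('tyler', 'green') appears in no cluster at all, presumably because its pairings fall below the threshold or fail the every-pair rule. A quick sanity check on the clique condition, with scores copied from the expected output above:

import itertools

# scores for rows 0, 1 and 8 as printed in the expected output
pair_scores = {
    frozenset({0, 1}): 0.939581,  # beech freddie / beech freedie
    frozenset({1, 8}): 0.923472,  # beech freedie / be freedie
    frozenset({0, 8}): 0.857679,  # beech freddie / be freedie
}
# all three combinations are present and clear the 0.83 threshold,
# so {0, 1, 8} is fully connected and survives the split intact
assert {frozenset(p) for p in itertools.combinations([0, 1, 8], 2)} == set(pair_scores)
assert all(score >= 0.83 for score in pair_scores.values())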
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = datamatch
-version = 0.1.7
+version = 0.1.8
 author = Khoi Pham
 author_email = pckhoi@gmail.com
 description = Data matching utilities