Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

* bug fixed: serialize simhash failed. #94

Merged
merged 1 commit into from
Nov 23, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 39 additions & 41 deletions data_juicer/ops/deduplicator/document_simhash_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,45 +21,45 @@
with AvailabilityChecking(['simhash-py'], OP_NAME):
import simhash

def local_num_differing_bits(hash_a, hash_b):
"""
Local implementation of calculating the number of different bits
between two integers.

def local_num_differing_bits(hash_a, hash_b):
    """
    Local implementation of calculating the number of different bits between
    two integers.

    The two hashes are XOR-ed and the set bits of the result are counted.
    Both inputs are expected to have the same sign (in practice: non-negative
    hash values), so that the XOR is non-negative.

    :param hash_a: integer hash value a
    :param hash_b: integer hash value b
    :return: number of different bits between input hashes.
    :raises ValueError: if the inputs have opposite signs. Python integers
        are arbitrary-precision, so such a pair differs in an unbounded
        number of bits (a clear-lowest-bit loop would never terminate).
    """
    diff = hash_a ^ hash_b
    if diff < 0:
        raise ValueError(
            'hash_a and hash_b must have the same sign to count their '
            'differing bits.')
    # bin() renders only the set/unset bits of a non-negative integer, so
    # counting the '1' characters is a population count done at C speed.
    return bin(diff).count('1')


def num_differing_bits_selector():
    """
    Select a num_differing_bits method according to the Python version
    installed.

    When Python >= 3.9, the original simhash library cannot be compiled
    correctly due to some changes in cython. After fixing this
    incompatibility, RecursionError occurs sometimes when calling
    simhash.num_differing_bits. So we use our implementation when Python
    >= 3.9. Otherwise, we use implementation of simhash.

    :return: an available num_differing_bits function.
    """
    import sys

    # Compare the structured version tuple instead of parsing the version
    # string: this needs no string splitting and also routes any future
    # major version (> 3) to the local implementation.
    if sys.version_info >= (3, 9):
        # for >= 3.9, use local implementation
        return local_num_differing_bits
    # for < 3.9, use simhash version
    return simhash.num_differing_bits
:param hash_a: integer hash value a
:param hash_b: integer hash value b
:return: number of different bits between input hashes.
"""
cnt = 0
n = hash_a ^ hash_b
while n != 0:
cnt += 1
n = n & (n - 1)
return cnt

def num_differing_bits_selector():
    """
    Select a num_differing_bits method according to the Python version
    installed.

    When Python >= 3.9, the original simhash library cannot be compiled
    correctly due to some changes in cython. After fixing this
    incompatibility, RecursionError occurs sometimes when calling
    simhash.num_differing_bits. So we use our implementation when Python
    >= 3.9. Otherwise, we use implementation of simhash.

    :return: an available num_differing_bits function.
    """
    import sys

    # Compare the structured version tuple instead of parsing the version
    # string: this needs no string splitting and also routes any future
    # major version (> 3) to the local implementation.
    if sys.version_info >= (3, 9):
        # for >= 3.9, use local implementation
        return local_num_differing_bits
    # for < 3.9, use simhash version
    return simhash.num_differing_bits

# Resolve the bit-difference implementation once and bind it to a plain
# name, so callers use `num_differing_bits(x, y)` directly.
# NOTE(review): presumably bound here rather than on the operator instance
# so the instance holds no function reference when serialized — confirm.
num_differing_bits = num_differing_bits_selector()


@OPERATORS.register_module(OP_NAME)
Expand Down Expand Up @@ -114,8 +114,6 @@ def __init__(self,
self.num_blocks = num_blocks
self.hamming_distance = hamming_distance

self.num_differing_bits = num_differing_bits_selector()

def compute_hash(self, sample):
"""
Compute simhash values for the sample.
Expand Down Expand Up @@ -189,7 +187,7 @@ def process(self, dataset, show_num=0):
dist = Counter()
for x, y in matches:
graph[x][y] = graph[y][x] = True
num_diff = self.num_differing_bits(x, y)
num_diff = num_differing_bits(x, y)
dist[num_diff] += 1
logger.info(f'Hash diff distribution: {dist}')

Expand Down