Skip to content

Commit

Permalink
0.6.14
Browse files Browse the repository at this point in the history
  • Loading branch information
joocer committed Jul 15, 2023
1 parent 37537fa commit 3c748b3
Show file tree
Hide file tree
Showing 14 changed files with 330 additions and 337 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ pip install --upgrade git+https://github.com/mabel-dev/mabel
## Dependencies

>- **[orjson](https://github.com/ijl/orjson)** for JSON (de)serialization
>- **[siphashc](https://github.com/WeblateOrg/siphashc)** for non-cryptographic hashing
>- **[pydantic](https://pydantic-docs.helpmanual.io/)** to define internal data models
>- **[zstandard](https://github.com/indygreg/python-zstandard)** for real-time on disk compression
>- **[LZ4](https://github.com/python-lz4/python-lz4)** for real-time in memory compression
Expand Down
8 changes: 2 additions & 6 deletions mabel/data/internals/dictset.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from mabel.data.internals.storage_classes import StorageClassMemory
from mabel.errors import MissingDependencyError
from mabel.utils.ipython import is_running_from_ipython
from siphashc import siphash
from orso.cityhash import CityHash32


class STORAGE_CLASS(int, Enum):
Expand Down Expand Up @@ -522,15 +522,11 @@ def __hash__(self, seed: int = 703115) -> int:
Creates a consistent hash of the _DictSet_ regardless of the order of
the items in the _DictSet_.
"""

def sip(val):
return siphash("TheApolloMission", val)

# The default seed is the duration of the Apollo 11 mission, in seconds:
# 703115 s = 8 days, 3 hours, 18 minutes, 35 seconds
ordered = map(lambda record: dict(sorted(record.items())), iter(self._iterator))
serialized = map(orjson.dumps, ordered)
hashed = map(sip, serialized)
hashed = map(CityHash32, serialized)
return reduce(lambda x, y: x ^ y, hashed, seed)

def __repr__(self): # pragma: no cover
Expand Down
10 changes: 3 additions & 7 deletions mabel/data/internals/group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from collections import defaultdict

import cython
from siphashc import siphash
from orso.cityhash import CityHash32


def summer(x, y):
Expand All @@ -26,8 +26,6 @@ def summer(x, y):
"AVG": lambda x, y: 1,
}

HASH_SEED = b"Anakin Skywalker"


class TooManyGroups(Exception):
pass
Expand Down Expand Up @@ -73,13 +71,11 @@ def _map(self, collect_columns):

for record in self._dictset:
try:
group_key: cython.uint64_t = siphash(
HASH_SEED,
group_key: cython.uint64_t = CityHash32(
"".join([str(record[column]) for column in self._columns]),
)
except KeyError:
group_key: cython.uint64_t = siphash(
HASH_SEED,
group_key: cython.uint64_t = CityHash32(
"".join([f"{record.get(column, '')}" for column in self._columns]),
)
if group_key not in self._group_keys.keys():
Expand Down
7 changes: 3 additions & 4 deletions mabel/data/internals/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@
from typing import Iterable

import orjson
from siphashc import siphash
from orso.cityhash import CityHash32

MAX_INDEX = 4294967295 # 2^32 - 1
SEED = "eschatologically" # needs to be 16 characters long

"""
There are overlapping terms because we're traversing a dataset so we can traverse a
Expand Down Expand Up @@ -68,7 +67,7 @@ def search(self, search_term) -> Iterable:
search_term = [search_term]
result: list = []
for term in search_term:
key = format(siphash(SEED, f"{term}") % MAX_INDEX, "x")
key = format(CityHash32(f"{term}") % MAX_INDEX, "x")
if key in self._index: # type:ignore
result[0:0] = self._index[key] # type:ignore
return result
Expand Down Expand Up @@ -100,7 +99,7 @@ def add(self, position, record):
if not isinstance(values, list):
values = [values]
for value in values:
entry = (format(siphash(SEED, f"{value}") % MAX_INDEX, "x"), position)
entry = (format(CityHash32(f"{value}") % MAX_INDEX, "x"), position)
ret_val.append(entry)
self.temporary_index += ret_val
return ret_val
Expand Down
5 changes: 2 additions & 3 deletions mabel/data/readers/internals/base_inner_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from mabel.logging import get_logger
from mabel.utils import dates
from mabel.utils import paths
from orso.cityhash import CityHash32

BUFFER_SIZE: int = 64 * 1024 * 1024 # 64 * 2**20 = 67,108,864; presumably a byte count (64 MiB) - confirm at use site

Expand Down Expand Up @@ -124,9 +125,7 @@ def read_blob(self, blob: str) -> IOBase:
return io.BytesIO(result)

# hash the blob name to use as the cache lookup key
from siphashc import siphash

blob_hash = str(siphash("RevengeOfTheBlob", blob))
blob_hash = str(CityHash32(blob))

# try to fetch the cached file
result = cache_server.get(blob_hash)
Expand Down
12 changes: 6 additions & 6 deletions mabel/data/readers/internals/cursor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
midway through the blob if required.
"""
import orjson
from siphashc import siphash
from orso.cityhash import CityHash32


class InvalidCursor(Exception):
Expand All @@ -29,7 +29,7 @@ def __init__(self, readable_blobs, cursor=None):
self.load_cursor(cursor)

def load_cursor(self, cursor):
from bitarray import bitarray
from orso.bitarray import bitarray

if cursor is None:
return
Expand All @@ -46,7 +46,7 @@ def load_cursor(self, cursor):

self.location = cursor["location"]
find_partition = [
blob for blob in self.readable_blobs if siphash("%" * 16, blob) == cursor["partition"]
blob for blob in self.readable_blobs if CityHash32(blob) == cursor["partition"]
]
if len(find_partition) == 1:
self.partition = find_partition[0]
Expand All @@ -66,7 +66,7 @@ def next_blob(self, previous_blob=None):
if self.partition in self.readable_blobs:
return self.partition
partition_finder = [
blob for blob in self.readable_blobs if siphash("%" * 16, blob) == self.partition
blob for blob in self.readable_blobs if CityHash32(blob) == self.partition
]
if len(partition_finder) != 1:
raise ValueError(f"Unable to determine current partition ({self.partition})")
Expand Down Expand Up @@ -94,15 +94,15 @@ def get(self):
}

def __getitem__(self, item):
from bitarray import bitarray
from orso.bitarray import bitarray

if item == "map":
blob_map = bitarray(
"".join(["1" if blob in self.read_blobs else "0" for blob in self.readable_blobs])
)
return blob_map.tobytes().hex()
if item == "partition":
return siphash("%" * 16, self.partition)
return CityHash32(self.partition)
if item == "location":
return self.location
return None
Expand Down
4 changes: 2 additions & 2 deletions mabel/data/readers/internals/inline_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import orjson
from mabel.utils.dates import parse_iso
from siphashc import siphash
from orso.cityhash import CityHash32


def get_year(input):
Expand Down Expand Up @@ -223,7 +223,7 @@ def get_md5(item):
"BOOLEAN": lambda x: str(x).upper() != "FALSE",
"ISNONE": lambda x: x is None,
# HASHING & ENCODING
"HASH": lambda x: format(siphash("INCOMPREHENSIBLE", str(x)), "X"),
"HASH": lambda x: format(CityHash32(str(x)), "X"),
"MD5": get_md5,
"RANDOM": get_random, # return a random number 0-99
# OTHER
Expand Down

0 comments on commit 3c748b3

Please sign in to comment.