In [None]:
#|default_exp _iutils.hashing

In [None]:
#|hide
from nblite import nbl_export, show_doc

nbl_export()
import netrun._iutils.hashing as this_module

In [None]:
#|export
from typing import Any

import pickle
import pickletools
import zlib
import binascii
import hashlib
import struct
import xxhash
import json
from enum import Enum

In [None]:
#|hide
show_doc(this_module._preprocess_data)

## _preprocess_data

```python
_preprocess_data(data: Any, pickle_protocol: int, try_json_dump: bool)
```

Preprocesses and converts the data to bytes for hashing.

---


In [None]:
#|exporti
def _preprocess_data(data: Any, pickle_protocol: int, try_json_dump: bool):
    """
    Preprocesses and converts the data to bytes for hashing.
    """
    if try_json_dump:
        try:
            data = json.dumps(data).encode("utf-8")
        except TypeError:
            pass

    type_data = type(data)

    if type_data is bytes:
        return data
    elif type_data is str:
        return data.encode("utf-8")
    elif type_data is int:
        return data.to_bytes((data.bit_length() + 8) // 8, byteorder="big", signed=True)
    elif type_data is float:
        return struct.pack("!d", data)
    else:
        _data = pickle.dumps(data, protocol=pickle_protocol)
        return pickletools.optimize(_data)

In [None]:
#|hide
show_doc(this_module.adler32)

## adler32

```python
adler32(bdata: bytes) -> int
```

Compute portable hash for given data.

---


In [None]:
#|export
def adler32(bdata: bytes) -> int:
    """
    Return the Adler-32 checksum of `bdata` as an unsigned 32-bit integer.

    The result of `zlib.adler32` is masked to 32 bits so the value always
    lies in the range 0..0xFFFFFFFF.
    """
    checksum = zlib.adler32(bdata)
    return checksum & 0xFFFFFFFF

In [None]:
#|hide
show_doc(this_module.crc32)

## crc32

```python
crc32(bdata: bytes) -> int
```

Compute portable hash using CRC32.

---


In [None]:
#|export
def crc32(bdata: bytes) -> int:
    """
    Return the CRC-32 checksum of `bdata` as an unsigned 32-bit integer.

    The result of `binascii.crc32` is masked to 32 bits so the value always
    lies in the range 0..0xFFFFFFFF.
    """
    raw_crc = binascii.crc32(bdata)
    return raw_crc & 0xFFFFFFFF

In [None]:
#|hide
show_doc(this_module.sha256)

## sha256

```python
sha256(bdata: bytes) -> int
```

Compute hash using SHA-256.

---


In [None]:
#|export
def sha256(bdata: bytes) -> int:
    """
    Return the SHA-256 digest of `bdata` as an integer.

    The raw digest bytes are interpreted as a big-endian unsigned integer.
    """
    digest = hashlib.sha256(bdata).digest()
    return int.from_bytes(digest, byteorder="big")

In [None]:
#|hide
show_doc(this_module.blake2b)

## blake2b

```python
blake2b(bdata: bytes) -> int
```

Compute hash using BLAKE2b.

---


In [None]:
#|export
def blake2b(bdata: bytes) -> int:
    """
    Return the BLAKE2b digest of `bdata` as an integer.

    Uses hashlib's default BLAKE2b parameters; the raw digest bytes are
    interpreted as a big-endian unsigned integer.
    """
    digest = hashlib.blake2b(bdata).digest()
    return int.from_bytes(digest, byteorder="big")

In [None]:
#|hide
show_doc(this_module.xxh64)

## xxh64

```python
xxh64(bdata: bytes) -> int
```

Compute hash using xxHash (64-bit).

---


In [None]:
#|export
def xxh64(bdata: bytes) -> int:
    """
    Return the 64-bit xxHash of `bdata` as an integer.
    """
    hasher = xxhash.xxh64(bdata)
    return hasher.intdigest()

In [None]:
#|hide
show_doc(this_module.hash)

## hash

```python
hash(
    data: Any,
    method: HashMethod,
    pickle_protocol: int,
    try_json_dump: bool
) -> int
```

---


In [None]:
#|export
class HashMethod(Enum):
    """
    Names of the hash algorithms that `hash` can dispatch to.

    Each member's value is the algorithm name as a lowercase string.
    """
    adler32 = "adler32"  # Adler-32 checksum (zlib)
    crc32 = "crc32"  # CRC-32 checksum (binascii)
    sha256 = "sha256"  # SHA-256 digest (hashlib)
    blake2b = "blake2b"  # BLAKE2b digest (hashlib)
    xxh64 = "xxh64"  # 64-bit xxHash (xxhash)

def hash(data: Any, method: HashMethod, pickle_protocol: int, try_json_dump: bool) -> int:
    """
    Hash arbitrary data with the selected algorithm.

    The data is first converted to bytes by `_preprocess_data`, then passed
    to the hash function matching `method`.

    Args:
        data: The value to hash.
        method: Which algorithm to use.
        pickle_protocol: Pickle protocol forwarded to `_preprocess_data`.
        try_json_dump: Forwarded to `_preprocess_data`; whether to try a JSON
            encoding of `data` before the other conversions.

    Returns:
        The hash value as an integer.

    Raises:
        ValueError: If `method` is not one of the `HashMethod` members.
    """
    bdata = _preprocess_data(data, pickle_protocol=pickle_protocol, try_json_dump=try_json_dump)

    # Checked in order with `==`, mirroring an if/elif chain.
    dispatch = (
        (HashMethod.adler32, adler32),
        (HashMethod.crc32, crc32),
        (HashMethod.sha256, sha256),
        (HashMethod.blake2b, blake2b),
        (HashMethod.xxh64, xxh64),
    )
    for candidate, hash_fn in dispatch:
        if method == candidate:
            return hash_fn(bdata)

    raise ValueError(f"Invalid hash method: {method}")

Try out the hashes

In [None]:
def hash_test(data, pickle_protocol, try_json_dump):
    """
    Hash `data` once with every `HashMethod`.

    Returns a dict mapping each method's string value to the resulting hash,
    in `HashMethod` definition order.
    """
    results = {}
    for method in HashMethod:
        results[method.value] = hash(
            data,
            method,
            pickle_protocol=pickle_protocol,
            try_json_dump=try_json_dump,
        )
    return results

Hash a Python object that is not JSON-serializable

In [None]:
# Settings for this example: pickle protocol 4, JSON attempt disabled.
pickle_protocol = 4
try_json_dump = False

# A plain user-defined instance with one attribute; with the JSON attempt
# disabled it is converted to bytes through the pickle path.
class MyObj:
    pass
data = MyObj()
data.foo = "bar"

# Compute one hash per method and print each as "<method>: <hash>".
no_try_json_hashes = hash_test(data, pickle_protocol, try_json_dump)
for method, hash_value in no_try_json_hashes.items():
    print(f"{method}: {hash_value}")

adler32: 287444407
crc32: 3136097778
sha256: 13209249479534018038727288401382217406635484227402999563509092632806349651550
blake2b: 5742021342148487910258696566976121048463358226850003362743285825890293921266239456690521425019110196433655864103324773346113751372164491122179844754684279
xxh64: 2650106758741252352


Hashing it with `try_json_dump == True` should not make a difference: the object is not JSON-serializable, so the JSON attempt fails and hashing falls back to pickle

In [None]:
try_json_dump = True

try_json_hashes = hash_test(data, pickle_protocol, try_json_dump)
# Both dicts are keyed by method name in the same order; compare per method.
for method_name, hash_value1 in no_try_json_hashes.items():
    hash_value2 = try_json_hashes[method_name]
    assert hash_value1 == hash_value2

Hash a JSON-serializable value

In [None]:
# Settings for this example: pickle protocol 4, JSON attempt disabled.
pickle_protocol = 4
try_json_dump = False

# A small dict that json.dumps can encode.
data = {
    "foo": "bar",
}

# Compute one hash per method and print each as "<method>: <hash>".
no_try_json_hashes = hash_test(data, pickle_protocol, try_json_dump)
for method, hash_value in no_try_json_hashes.items():
    print(f"{method}: {hash_value}")

adler32: 1014367708
crc32: 2592268716
sha256: 97719036751630258470325823430456647924578343190400491720741504733485529460071
blake2b: 7867117342056934942205395767428606501681309652480188006592193483810874548160793558201678679101296496815200077137122048092707361723746073304557170538006501
xxh64: 3773599617194216955


Now `try_json_dump == True` should yield different hashes, because the dict is hashed from its JSON encoding instead of its pickle

In [None]:
try_json_dump = True

try_json_hashes = hash_test(data, pickle_protocol, try_json_dump)
# Both dicts are keyed by method name in the same order; compare per method.
for method_name, hash_value1 in no_try_json_hashes.items():
    hash_value2 = try_json_hashes[method_name]
    assert hash_value1 != hash_value2