# How to Detect Duplicates Using Hashing

**TODO:** Write the introduction.

In [15]:
import hashlib
from datetime import datetime
from pprint import pprint
import json

In [3]:
# Show every algorithm this interpreter's hashlib can use, followed by the
# subset that hashlib guarantees on all platforms.
for algorithm_names in (hashlib.algorithms_available, hashlib.algorithms_guaranteed):
    pprint(algorithm_names)

{'DSA',
 'DSA-SHA',
 'MD4',
 'MD5',
 'MDC2',
 'RIPEMD160',
 'SHA',
 'SHA1',
 'SHA224',
 'SHA256',
 'SHA384',
 'SHA512',
 'blake2b',
 'blake2s',
 'dsaEncryption',
 'dsaWithSHA',
 'ecdsa-with-SHA1',
 'md4',
 'md5',
 'mdc2',
 'ripemd160',
 'sha',
 'sha1',
 'sha224',
 'sha256',
 'sha384',
 'sha3_224',
 'sha3_256',
 'sha3_384',
 'sha3_512',
 'sha512',
 'shake_128',
 'shake_256',
 'whirlpool'}
{'blake2b',
 'blake2s',
 'md5',
 'sha1',
 'sha224',
 'sha256',
 'sha384',
 'sha3_224',
 'sha3_256',
 'sha3_384',
 'sha3_512',
 'sha512',
 'shake_128',
 'shake_256'}


## Hashing Introduction


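What is a hash? A hash function maps input bytes of any length to a fixed-size digest. The same input always produces the same digest, which is what makes hashes useful for spotting duplicates.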

In [4]:
# Start with an empty SHA-256 hash object
hasher = hashlib.sha256()

# Feed in the first chunk of bytes and show the digest so far
hasher.update(b"I am a fun hash!")
print(f"I am a fun hash!: {hasher.hexdigest()}")

# update() appends to what was already fed in (no separator is inserted),
# so the digest below covers both byte strings joined back to back.
hasher.update(b"To be or not to be")
print(f"I am a fun hash!  To be or not to be: {hasher.hexdigest()}")

I am a fun hash!: 8a5495ee258d6b2b01284360026224ceb67451d111ce32611101a733c9252fe9
I am a fun hash!  To be or not to be: ba29113f3516acb39fda9a9db0338a58008fe776d2100d36bd163611a408d492


In [5]:
# One-shot hashing: hand the bytes to the constructor instead of calling update().
hashlib.new("sha224", b"Nobody inspects the spammish repetition").hexdigest()

'a4337bc45a8fc544c03f52dc550cd6e1e87021bc896588bd79e901e2'

## Example:  How to detect duplicate dictionaries

Suppose Alice sends Bob a series of JSON objects over the wire, but Alice is forgetful and sometimes sends multiple JSON blobs containing the same data. Bob doesn't want to read the same message more than once, so he needs a way to tell whether a message is a duplicate of one he has already received.

In [14]:
# Alice's batch: the second and fourth entries carry the same text,
# but each entry is stamped with its own datetime.now() call.
alices_messages = [
    dict(message='Hello, Bob!', timestamp=datetime.now()),
    dict(message='How are you doing, Bob?', timestamp=datetime.now()),
    dict(id=12454432, message='I see you Bob!', timestamp=datetime.now()),
    dict(message='How are you doing, Bob?', timestamp=datetime.now()),
]

pprint(alices_messages)


# Bob receives the batch and hashes only the text of each message,
# so identical texts produce identical digests regardless of timestamp.
for entry in alices_messages:
    payload = entry.get('message').encode('utf-8')
    digest = hashlib.sha256(payload).hexdigest()
    print(f'{payload}: {digest}')

[{'message': 'Hello, Bob!',
  'timestamp': datetime.datetime(2019, 1, 16, 4, 13, 2, 796639)},
 {'message': 'How are you doing, Bob?',
  'timestamp': datetime.datetime(2019, 1, 16, 4, 13, 2, 796654)},
 {'id': 12454432,
  'message': 'I see you Bob!',
  'timestamp': datetime.datetime(2019, 1, 16, 4, 13, 2, 796665)},
 {'message': 'How are you doing, Bob?',
  'timestamp': datetime.datetime(2019, 1, 16, 4, 13, 2, 796675)}]
b'Hello, Bob!': 2558308eb3f6e132ab8ea8e38267d741ad28d99e5144e5767f4833517caa8d7f
b'How are you doing, Bob?': 58d51a0d8906345bc8d748eb35ed951e74375ac0660b4509cee4b3b32b5a6ef1
b'I see you Bob!': 75fc448b1b8a88cd7faafbe27dc1d256861cdf17c43b957a26c0e29877e02aee
b'How are you doing, Bob?': 58d51a0d8906345bc8d748eb35ed951e74375ac0660b4509cee4b3b32b5a6ef1


In [33]:
# Bob has spotted the repeats, so he keeps a dictionary keyed by message hash
# and uses it to filter out anything he has already seen.

# The two repeated messages share this timestamp, so they are identical in every field.
duplicate_timestamp = datetime.now()

alices_messages = [
    {
        'message': 'Hello, Bob!',
        'timestamp': datetime.now(),
        'weather': 'cloudy with a chance of meatballs',
    },
    {'message': 'How are you doing, Bob?', 'timestamp': duplicate_timestamp},
    {'id': 12454432, 'message': 'I see you Bob!', 'timestamp': datetime.now()},
    {'message': 'How are you doing, Bob?', 'timestamp': duplicate_timestamp},
]

unique_messages = {}

for message in alices_messages:
    # sort_keys makes the serialized text independent of key order;
    # default=str turns the datetime (not JSON-serializable) into a string.
    canonical_json = json.dumps(message, default=str, sort_keys=True)
    print(f'encoded message: {canonical_json}')
    digest = hashlib.sha256(canonical_json.encode('utf-8')).hexdigest()
    print(f'hash: {digest}')

    # A digest we have already stored means the whole message is a repeat.
    if digest in unique_messages:
        print('Silly Alice, she sent a duplicate message!')
        continue

    # First time we see this digest: add the message to the docket.
    unique_messages[digest] = message

encoded message: {"message": "Hello, Bob!", "timestamp": "2019-01-16 04:22:36.273661", "weather": "cloudy with a chance of meatballs"}
hash: 8dedfc71051b1b0ec17513c0a68c45d274dc4e6efef68e540ab507e451188eda
encoded message: {"message": "How are you doing, Bob?", "timestamp": "2019-01-16 04:22:36.273396"}
hash: b14c777503140bb3a752e39448ad800b45338fa6219461d0aad06ace01a040e3
encoded message: {"id": 12454432, "message": "I see you Bob!", "timestamp": "2019-01-16 04:22:36.273675"}
hash: 4e154fb2aeeffa2acef67b74acc042abafa81e986d2fb7bcae10d49131f0271c
encoded message: {"message": "How are you doing, Bob?", "timestamp": "2019-01-16 04:22:36.273396"}
hash: b14c777503140bb3a752e39448ad800b45338fa6219461d0aad06ace01a040e3
Silly Alice, she sent a duplicate message!


In [32]:
# Show what survived de-duplication: a dict mapping each message's hash to the message itself.
print(unique_messages)

{'920d25c5945a06816296e76be6fed3580bb90301b917a95d2e6435de51d313e2': {'message': 'Hello, Bob!', 'timestamp': datetime.datetime(2019, 1, 16, 4, 21, 48, 704614), 'weather': 'cloudy with a chance of meatballs'}, '8e68cdd36a43b86f1bc8a97fdfad68c0579b9a7bc63074ce5c2afd19f0e63b8e': {'message': 'How are you doing, Bob?', 'timestamp': datetime.datetime(2019, 1, 16, 4, 21, 48, 704211)}, '61f2ce83de73f8c8ca3781ae782e1dae4402f2b25e025e1461e6bc6467f1bd81': {'id': 12454432, 'message': 'I see you Bob!', 'timestamp': datetime.datetime(2019, 1, 16, 4, 21, 48, 704627)}}


## Cryptographically Secure Hashing