# Chapter 12 - Managing Big Data 

## Reserving the right data (Reservoir sampling)

In [10]:
import string
# Simulated data stream: every uppercase ASCII letter followed by
# every lowercase ASCII letter, one character per element
datastream = [*string.ascii_uppercase, *string.ascii_lowercase]
print(datastream)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [11]:
from random import seed, randint
# Pin the random generator state so the draws that follow are repeatable
# (change this value for different results)
seed(9)
# The reservoir starts out empty; sample_size is the number of elements
# it is meant to hold
sample, sample_size = list(), 5

In [12]:
for index, element in enumerate(datastream):
    if index >= sample_size:
        # The reservoir is already full: draw a slot uniformly among all
        # the elements seen so far (randint includes both endpoints)
        drawn = randint(0, index)
        # Only when the drawn slot is less than the sample size does it
        # fall inside the reservoir; in that case the element arriving
        # from the stream takes the place of the one stored there
        if drawn < sample_size:
            sample[drawn] = element
        continue
    # The first sample_size elements go straight into the reservoir
    sample.append(element)

In [13]:
# Show the contents of the reservoir once the whole stream has been read
print(sample)

['y', 'e', 'v', 'F', 'i']


## Filtering stream elements by heart (Bloom Filter)

In [14]:
# Bloom filter parameters: how many positions are probed per element,
# and how many bits the filter has
hash_functions, bit_vector_length = 3, 10
# Every bit of the filter starts cleared
bit_vector = [0 for _ in range(bit_vector_length)]

In [15]:
from hashlib import md5, sha1

def hash_f(element, i, length):
    """Return the i-th hash position of `element` in a table of `length` slots.

    The position is computed by double hashing: the MD5 and SHA-1 digests
    of the element are read as integers h1 and h2, and the result is
    ``(h1 + i * h2) % length``. Different values of `i` therefore give a
    family of hash functions built from just two digests.

    Parameters:
        element: the string to hash. It is encoded as UTF-8, which yields
            the same bytes as ASCII for pure-ASCII text and also accepts
            non-ASCII text (e.g. internationalized domain names).
        i: index of the hash function in the family (0, 1, 2, ...).
        length: number of slots; the result lies in ``range(length)``
            for a positive `length`.

    Returns:
        The integer slot index.

    Raises:
        ZeroDivisionError: if `length` is 0.
    """
    # Encode once and feed the same bytes to both digests
    data = element.encode('utf-8')
    h1 = int(md5(data).hexdigest(), 16)
    h2 = int(sha1(data).hexdigest(), 16)
    return (h1 + i * h2) % length

def insert_filter(website):
    """Add `website` to the Bloom filter and print the positions it set.

    One position is computed per hash function (``hash_functions`` of
    them, each via ``hash_f`` over ``bit_vector_length`` slots); the
    matching entries of the module-level ``bit_vector`` are set to 1.
    Nothing is returned.
    """
    positions = [hash_f(website, k, bit_vector_length)
                 for k in range(hash_functions)]
    # Setting a bit that is already 1 leaves it unchanged
    for slot in positions:
        bit_vector[slot] = 1
    print('Inserted in positions: %s' % positions)

def check_filter(website):
    """Print the filter bits found at every position `website` hashes to.

    The same ``hash_functions`` positions that an insertion would set are
    computed with ``hash_f``; each is reported as a
    ``(position, bit_value)`` pair read from the module-level
    ``bit_vector``. The filter is not modified and nothing is returned.
    """
    slots = (hash_f(website, k, bit_vector_length)
             for k in range(hash_functions))
    probes = [(slot, bit_vector[slot]) for slot in slots]
    print('Bytes in positions: %s' % probes)

In [16]:
# Add a first site to the filter, then display the updated bit vector
insert_filter('wikipedia.org')
print(bit_vector)

Inserted in positions: [0, 8, 6]
[1, 0, 0, 0, 0, 0, 1, 0, 1, 0]


In [17]:
# Add a second site; any position it shares with an earlier insertion
# simply stays at 1
insert_filter('youtube.com')
print(bit_vector)

Inserted in positions: [3, 0, 7]
[1, 0, 0, 1, 0, 0, 1, 1, 1, 0]


In [18]:
# Probe a site that was not inserted above: a 0 at any of the reported
# positions means the site cannot have been added to the filter
check_filter('yahoo.com')

Bytes in positions: [(7, 1), (5, 0), (3, 1)]
