# Chapter 5: Sorting and Searching

## 5.1 Hashing
The idea is as follows:
1. we have input data drawn from a large range of numerical values, say of size N >> 1
2. we want to store all these data in some systematic way in a fixed number d of slots, where d does not grow with N (d = O(1))
3. therefore, we need a function that divides the range of numerical values as evenly as possible into d sets. Such a function is called a hash function

### 5.1.1. Hash function
To construct a hash function, a common method is to employ the following theorem:
<p>
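    <b>Theorem.</b> Suppose that $X_1,...,X_N$ are integer-valued i.i.d. random variables. Then, under mild conditions on the distribution of $X_1 \bmod d$, $Y_N = (X_1 + \cdots + X_N) \bmod d$ converges in distribution to the uniform distribution on $\{0, 1, \dots, d-1\}$ as $N \rightarrow \infty$.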
</p>
<p>
    An example is a telephone number such as 123-456-7890. One can define the hash function 123-456-7890 --> (12 + 34 + 56 + 78 + 90) % 11. Often the digits of every other block of 2 digits are swapped to improve randomness (i.e. to make the values more equally distributed mod 11).
</p>

### 5.1.2 Implementation of hash table
We provide a preliminary implementation that consists of
- put(key, value): insert the key-value pair
- get(key): return the value stored under key, or None if the key is absent
- del: delete the key-value pair
- len(): return the number of stored key-value pairs
- in: True if key is in the hash table, False otherwise
- it is assumed that key is an int

In [26]:
class HashTable(object):
    """Fixed-size open-addressing hash table for integer keys.

    Collisions are resolved by linear probing (see ``rehash``).  A deleted
    slot is marked with the ``tomb`` sentinel instead of being emptied, so
    that probe chains running through it stay intact.  The table never
    grows: once no slot is reusable, putting a new key overwrites whatever
    is stored in that key's home slot.
    """

    # __init__: self, int --> void
    # size = size of the hash table
    def __init__(self, size=11):
        """Create an empty table with ``size`` slots.

        Raises:
            ValueError: if ``size`` is smaller than 1.
        """
        if size < 1:
            raise ValueError('size must be a positive integer')
        self.size = size
        self.keys = [None] * self.size
        self.data = [None] * self.size
        # Placeholder for deleted items.  It is always compared by identity
        # (``is``), never by equality, so it cannot be confused with a key.
        self.tomb = {None}

    # _find: self, int --> int or None
    def _find(self, key):
        """Return the slot index that holds ``key``, or None if absent.

        Probing skips tombstones and stops at the first never-used slot or
        after one full cycle through the table.
        """
        start = self.hashfunction(key, self.size)
        slot = start
        while self.keys[slot] is not None:
            occupant = self.keys[slot]
            if occupant is not self.tomb and occupant == key:
                return slot
            slot = self.rehash(slot, self.size)
            if slot == start:
                break
        return None

    # put: self, int, obj --> void
    # put puts the key-value pair into the hash table such that
    # - the key is unique in the hashtable: latest value is used in case of duplicates
    # - when the hashtable is filled, replacement occurs when put is called
    def put(self, key, value):
        """Store ``value`` under ``key``.

        If ``key`` is already present its value is overwritten.  Otherwise
        the pair goes into the first reusable slot (tombstone or empty) on
        the probe path.  If there is no reusable slot, the entry in the
        key's home slot is replaced.
        """
        start = self.hashfunction(key, self.size)
        slot = start
        reusable = None  # first tombstone/empty slot seen on the probe path

        while True:
            occupant = self.keys[slot]
            if occupant is None:
                # End of the probe chain: the key cannot be further along.
                if reusable is None:
                    reusable = slot
                break
            if occupant is self.tomb:
                # Remember the slot but keep probing: the key may still be
                # stored behind this tombstone.
                if reusable is None:
                    reusable = slot
            elif occupant == key:
                self.data[slot] = value
                return
            slot = self.rehash(slot, self.size)
            if slot == start:
                break

        if reusable is None:
            # Every slot holds a live key: replace the home slot.
            reusable = start
        self.keys[reusable] = key
        self.data[reusable] = value

    # hashfunction: self, int, int --> int
    def hashfunction(self, key, size):
        """Map ``key`` to its home slot: ``key % size``."""
        return key % size

    # rehash: self, int, int, int --> int
    def rehash(self, key, size, step=1):
        """Return the slot ``step`` positions after slot ``key``, wrapping at ``size``."""
        return (key + step) % size

    # __str__: self --> string
    def __str__(self):
        """Render the live entries as ``{k : v, k : v}`` in slot order."""
        entries = [
            str(k) + ' : ' + str(v)
            for k, v in zip(self.keys, self.data)
            if k is not None and k is not self.tomb
        ]
        return '{' + ', '.join(entries) + '}'

    # get: int --> obj
    # returns the value associated to the key
    # returns None if no key is present
    def get(self, key):
        """Return the value stored under ``key``, or None if it is absent."""
        slot = self._find(key)
        if slot is None:
            return None
        return self.data[slot]

    # __getitem__: self, int --> obj
    # overloading []
    def __getitem__(self, key):
        """Support ``table[key]``; returns None for a missing key."""
        return self.get(key)

    # __setitem__: self, key, obj --> void
    # overloading x[...] = ...
    def __setitem__(self, key, data):
        """Support ``table[key] = data``."""
        self.put(key, data)

    def len(self):
        """Return the number of live entries (tombstones are not counted)."""
        return sum(
            1 for k in self.keys
            if k is not None and k is not self.tomb
        )

    # __contains__: self, int --> bool
    # overloading in: check if key is in the hashtable
    # using a statement of the form
    # key in map
    def __contains__(self, key):
        """Support ``key in table``."""
        return self._find(key) is not None

    # __delitem__: self, int --> void
    # delete the key-value pair using a statement of the form
    # del map[key]
    # do nothing if key is not in map
    def __delitem__(self, key):
        """Support ``del table[key]``; a missing key is silently ignored."""
        slot = self._find(key)
        if slot is None:
            return
        # Leave a tombstone so keys stored further along this probe chain
        # remain reachable.
        self.keys[slot] = self.tomb
        self.data[slot] = None
        


In [36]:
def test1(f):
    x = f(5)
    
    print('put:')
    for i in range(0,11):
        x[i] = i
        print(i, x)
    print('\n')
    
    print('get and __contains__')
    for i in range(0,11):
        print(x[i])
        print(i in x)
    print('\n')
    
    # len()
    print('len')
    print(x.len())
    print('\n')
    
    # del
    print('__delitem__')
    print('x:', x)
    for i in range(0,12):
        del x[i]
        print(i, x)
    print('\n')

def test2(f, size=7):
    """Randomised add/delete exercise for a hash-table class.

    ``f(size)`` builds the table, which is first filled with keys
    0..size-1.  Then 100 random operations are applied: each one draws an
    action from {0, 1, 2} and a key from 0..size-1; action 0 deletes the
    key, any other action stores 0 under it.  The table's ``keys``
    attribute is printed after every step.
    """
    import numpy as np

    table = f(size)
    for key in range(size):
        table[key] = key
        print(key, table.keys)

    for _ in range(100):
        # Draw order matters for reproducibility: action first, then key.
        action = np.random.randint(3)
        key = np.random.randint(size)
        if action != 0:
            print('add {}'.format(key), end=' ')
            table[key] = 0
        else:
            print('del {}'.format(key), end=' ')
            del table[key]
        print(table.keys)

In [29]:
test1(HashTable)

put:
0 {0 : 0, }
1 {0 : 0, 1 : 1, }
2 {0 : 0, 1 : 1, 2 : 2, }
3 {0 : 0, 1 : 1, 2 : 2, 3 : 3, }
4 {0 : 0, 1 : 1, 2 : 2, 3 : 3, 4 : 4, }
5 {5 : 5, 1 : 1, 2 : 2, 3 : 3, 4 : 4, }
6 {5 : 5, 6 : 6, 2 : 2, 3 : 3, 4 : 4, }
7 {5 : 5, 6 : 6, 7 : 7, 3 : 3, 4 : 4, }
8 {5 : 5, 6 : 6, 7 : 7, 8 : 8, 4 : 4, }
9 {5 : 5, 6 : 6, 7 : 7, 8 : 8, 9 : 9, }
10 {10 : 10, 6 : 6, 7 : 7, 8 : 8, 9 : 9, }


get and __contains__
None
False
None
False
None
False
None
False
None
False
None
False
6
True
7
True
8
True
9
True
10
True


len
5


__delitem__
x: {10 : 10, 6 : 6, 7 : 7, 8 : 8, 9 : 9, }
0 {10 : 10, 6 : 6, 7 : 7, 8 : 8, 9 : 9, }
1 {10 : 10, 6 : 6, 7 : 7, 8 : 8, 9 : 9, }
2 {10 : 10, 6 : 6, 7 : 7, 8 : 8, 9 : 9, }
3 {10 : 10, 6 : 6, 7 : 7, 8 : 8, 9 : 9, }
4 {10 : 10, 6 : 6, 7 : 7, 8 : 8, 9 : 9, }
5 {10 : 10, 6 : 6, 7 : 7, 8 : 8, 9 : 9, }
6 {10 : 10, 7 : 7, 8 : 8, 9 : 9, }
7 {10 : 10, 8 : 8, 9 : 9, }
8 {10 : 10, 9 : 9, }
9 {10 : 10, }
10 {}
11 {}




In [37]:
test2(HashTable, size=5)

0 [0, None, None, None, None]
1 [0, 1, None, None, None]
2 [0, 1, 2, None, None]
3 [0, 1, 2, 3, None]
4 [0, 1, 2, 3, 4]
add 4 [0, 1, 2, 3, 4]
add 3 [0, 1, 2, 3, 4]
add 1 [0, 1, 2, 3, 4]
add 0 [0, 1, 2, 3, 4]
add 2 [0, 1, 2, 3, 4]
add 2 [0, 1, 2, 3, 4]
del 4 [0, 1, 2, 3, {None}]
add 2 [0, 1, 2, 3, {None}]
add 0 [0, 1, 2, 3, {None}]
add 2 [0, 1, 2, 3, {None}]
add 4 [0, 1, 2, 3, 4]
del 0 [{None}, 1, 2, 3, 4]
add 0 [0, 1, 2, 3, 4]
add 4 [0, 1, 2, 3, 4]
add 0 [0, 1, 2, 3, 4]
add 4 [0, 1, 2, 3, 4]
del 0 [{None}, 1, 2, 3, 4]
del 1 [{None}, {None}, 2, 3, 4]
add 3 [{None}, {None}, 2, 3, 4]
del 3 [{None}, {None}, 2, {None}, 4]
add 1 [{None}, 1, 2, {None}, 4]
del 4 [{None}, 1, 2, {None}, {None}]
add 2 [{None}, 1, 2, {None}, {None}]
add 0 [0, 1, 2, {None}, {None}]
add 1 [0, 1, 2, {None}, {None}]
add 2 [0, 1, 2, {None}, {None}]
add 4 [0, 1, 2, {None}, 4]
del 2 [0, 1, {None}, {None}, 4]
add 2 [0, 1, 2, {None}, 4]
add 3 [0, 1, 2, 3, 4]
add 0 [0, 1, 2, 3, 4]
add 4 [0, 1, 2, 3, 4]
add 4 [0, 1, 2, 3, 4]

### 5.1.3 Implementation of hash table II (to be completed)
This differs from 5.1.2 in the following ways:
- when the hashtable is full, automatically double the size
- use quadratic probing (not yet implemented: the code below still uses linear probing)

In [69]:
class HashTable2(object):
    """Open-addressing hash table for integer keys that doubles when full.

    Collisions are resolved by linear probing (``rehash`` with step 1).
    Every slot of ``keys`` holds either a live key or None; deletion
    re-inserts the rest of the affected cluster instead of leaving a
    placeholder, so lookups can always stop at the first None.
    """

    # __init__: self, int --> void
    # size = size of the hash table
    def __init__(self, size=11):
        """Create an empty table with ``size`` slots.

        Raises:
            ValueError: if ``size`` is smaller than 1 (such a table could
                never grow by doubling).
        """
        if size < 1:
            raise ValueError('size must be a positive integer')
        self.length = 0  # number of stored keys
        self.size = size  # number of slots
        self.keys = [None] * self.size
        self.data = [None] * self.size

    # _probe: self, int --> int or None
    def _probe(self, key):
        """Return the slot holding ``key``, else the first empty slot on its path.

        Returns None when the table is completely full and ``key`` is not
        stored in it.
        """
        start = self.hashfunction(key, self.size)
        slot = start
        while self.keys[slot] is not None and self.keys[slot] != key:
            slot = self.rehash(slot, self.size)
            if slot == start:
                return None
        return slot

    # put: self, int, obj --> void
    # put puts the key-value pair into the hash table such that
    # - the key is unique in the hashtable: latest value is used in case of duplicates
    # - when the hashtable is full and the key is new, the table is doubled first
    def put(self, key, value):
        """Store ``value`` under ``key``, growing the table if it is full."""
        slot = self._probe(key)
        if slot is None:
            # No free slot and the key is new: grow, then probe again.
            self.expand()
            slot = self._probe(key)

        if self.keys[slot] is None:
            # Only a brand-new key changes the element count.
            self.keys[slot] = key
            self.length += 1
        self.data[slot] = value

    def expand(self):
        """Double the number of slots and re-insert every stored pair."""
        old_keys = self.keys
        old_data = self.data

        self.size *= 2
        self.keys = [None] * self.size
        self.data = [None] * self.size
        self.length = 0  # rebuilt by the put() calls below

        for k, v in zip(old_keys, old_data):
            if k is not None:
                self.put(k, v)

    # hashfunction: self, int, int --> int
    def hashfunction(self, key, size):
        """Map ``key`` to its home slot: ``key % size``."""
        return key % size

    # rehash: self, int, int, int --> int
    def rehash(self, key, size, step=1):
        """Return the slot ``step`` positions after slot ``key``, wrapping at ``size``."""
        return (key + step) % size

    # __str__: self --> string
    def __str__(self):
        """Render the stored entries as ``{k : v, k : v}`` in slot order."""
        entries = [
            str(k) + ' : ' + str(v)
            for k, v in zip(self.keys, self.data)
            if k is not None
        ]
        return '{' + ', '.join(entries) + '}'

    # get: int --> obj
    # returns the value associated to the key
    # returns None if no key is present
    def get(self, key):
        """Return the value stored under ``key``, or None if it is absent."""
        slot = self._probe(key)
        if slot is None or self.keys[slot] is None:
            return None
        return self.data[slot]

    # __getitem__: self, int --> obj
    # overloading []
    def __getitem__(self, key):
        """Support ``table[key]``; returns None for a missing key."""
        return self.get(key)

    # __setitem__: self, key, obj --> void
    # overloading x[...] = ...
    def __setitem__(self, key, data):
        """Support ``table[key] = data``."""
        self.put(key, data)

    def len(self):
        """Return the number of stored keys."""
        return self.length

    # __contains__: self, int --> bool
    # overloading in: check if key is in the hashtable
    # using a statement of the form
    # key in map
    def __contains__(self, key):
        """Support ``key in table``."""
        slot = self._probe(key)
        return slot is not None and self.keys[slot] is not None

    # __delitem__: self, int --> void
    # delete the key-value pair using a statement of the form
    # del map[key]
    # do nothing if key is not in map
    def __delitem__(self, key):
        """Support ``del table[key]``; a missing key is silently ignored."""
        slot = self._probe(key)
        if slot is None or self.keys[slot] is None:
            return

        self.keys[slot] = None
        self.data[slot] = None
        self.length -= 1

        # The entries that follow the freed slot (up to the next empty slot)
        # may have been placed there because of a collision that passed
        # through it.  Lift them out and re-insert them so none of them ends
        # up stranded behind the new gap.
        displaced = []
        nxt = self.rehash(slot, self.size)
        while self.keys[nxt] is not None:
            displaced.append((self.keys[nxt], self.data[nxt]))
            self.keys[nxt] = None
            self.data[nxt] = None
            nxt = self.rehash(nxt, self.size)

        self.length -= len(displaced)
        for k, v in displaced:
            self.put(k, v)
        
        
