# Chapter 10 - Searching and Sorting
Marla Odell

In [2]:
from Objects.trees_and_nodes import *

**(10.1) Sorted Merge:** You are given two sorted arrays, A and B, where A has a large enough buffer at the end to hold B. Write a method to mege B into sorted order.

In [3]:
def sorted_merge(A, B):
    b, a = len(B) - 1, len(A) - len(B) - 1
    while not (a < 0 or b < 0): #While there are values in both lists
        if A[a] > B[b]:
            A[a + b + 1] = A[a]
            a -= 1
        else:
            A[a + b + 1] = B[b]
            b -= 1
    while not (b < 0): #Fill the rest of the list
        A[b] = B[b]
        b -= 1

**(10.2) Group Anagrams:** Write a method to sort an array of strings so that all the anagrams are next to each other.

In [4]:
def group_anagrams(strings):
    sorted_tuples = [(string, sorted(string)) for string in strings]
    sorted_tuples.sort(key = lambda i: i[1]) #Sort by the sorted strings (grouping anagrams)
    return [i[0] for i in sorted_tuples]

**(10.3) Search in Rotated Array:** Given a sorted array of n integers that has been rotated an unknown number of times, write code to find an element in the array. You may assume that the array was originally sorted in increasing order.

In [5]:
def search_in_rotated_array(values, key): #Finds location of key within values
    n = len(values)
    pivot = rotation_index(values, 0, n - 1)
    if pivot == None: #Array is not rotataed 
        return binary_search(values, 0, n - 1, key); 
    elif values[pivot] == key: 
        return pivot 
    elif values[0] > key: 
        return binary_search(values, pivot + 1, n - 1, key)
    return binary_search(values, 0, pivot - 1, key)

def rotation_index(values, i_left, i_right): #Looks for the index of pivot value (rotated about)
    if i_right < i_left:
        return None
    elif i_right == i_left: 
        return i_left 
    i_middle = (i_left + i_right) // 2 
    if i_middle > i_left and values[i_middle] < values[i_middle - 1]: 
        return i_middle - 1
    elif i_middle < i_right and values[i_middle] > values[i_middle + 1]: 
        return i_middle 
    elif values[low] >= values[i_middle]: 
        return rotation_index(values, i_left, i_middle-1) 
    return rotation_index(values, i_middle + 1, i_right) 

def binary_search(values, i_left, i_right, key): 
    if i_right >= i_left: 
        i_middle = (i_left + i_right) // 2
        if values[i_middle] == key: 
            return i_middle 
        if values[i_middle] > key: 
            return binary_search(values, i_left, i_middle - 1, key) 
        return binary_search(values, i_middle + 1, i_right, key) 
    return None

**(10.4) Sorted Search, No Size:** You are given an array-like data structure ```Listy``` which lacks a size method. It does, however have an ```elementAt(i)``` method that returns the element at index i in *O(1)* time. If i is beyond the bounds of the data structure it returns -1. (For this reason, the data structure only supports posivite integers.) Given a Listy which contains sorted, positive integers, find the index at which an element x occurs. If x occurs multiple times, you may return any index. 

In [6]:
class Listy:
    
    def __init__(self, values):
        self.values = values
        
    def elementAt(self, i):
        if i < len(self.array):
            return self.values[i]
        return -1
    
def sorted_search(listy, key): 
    i_left, i_right = 0, 1
    current = listy.values[0] 
    while current < key: 
        i_left, i_right = i_right, 2 * i_right #Doubles index as you search the unknown size array
        current = listy.values[i_right]
    return binary_search(listy.values, i_left, i_right, key) #Uses binary search method from (10.3)

**(10.5) Sparse Search:** Given a sorted array of strings that is interspersed with empty strings write a method to find the location of a given string.

In [7]:
def sparse_search(strings, key, i_left = None, i_right = None):
    i_left = 0 if i_left is None else i_left
    i_right = len(strings) - 1 if i_right is None else i_right
    i, j = 0, 0
    i_middle = (i_left + i_right) // 2
    if strings[i_middle] == '':
        i, j = i_middle - 1, i_middle + 1
        while True: #Looking for closest non-empty string
            if i < i_left and j > i_right:
                return None 
            elif (i >= i_left) and (strings[i] != ''): 
                i_middle = i 
                break
            elif (j <= i_right) and (strings[j] != ''): 
                i_middle = j 
                break
    if strings[i_middle] == key: 
        return i_middle   
    elif strings[i_middle] < key: 
        return sparse_search(strings, key, i_middle + 1, i_right)
    elif strings[i_middle] > key: 
        return sparse_search(strings, key, i_left, i_middle - 1)
    i -= 1
    j += 1

**(10.6) Sort Big File:** Imagine you have a 20GB file with one string per line. Explain how you would sort the file. 

Use an external sorting algorithm, such that you divide the 20GB file into chunks of m-size (where m is the amount of memory avaialble). Sort each chunk independently (bringing it into working memory, preforming a conventional sorting algorithm locally, and then saving the sorted version externally). Finally perform a (20/m)-way merge of the sorted chunks together to get the final sorting. 

**(10.7) Missing Int:** Given an input file with four billion non-negative integers, provide an algorithm to generate an integer that is not contained in the file. Assume you have 1GB of memory available for the task.

In [8]:
def traverse_int(int_list):
    bit_vec = [0] * (2**8) #Change for actual use: (2**32)
    for i in int_list:
        bit_vec[i] = 1 #Marks value as seen
    return bit_vec

def find_missing(bit_vec):
    for i in range(len(bit_vec)):
        if bit_vec[i] == 0:
            return i

def missing_int(int_list):
    bit_vec = traverse_int(int_list)
    return find_missing(bit_vec)

Follow Up: What if you have only 10 MB of memory? Assume that all the values are distinct and we now have no more than one billion non-negative integers.

In [9]:
def first_pass(int_list, max_val, block_len):
    blocks = [[] for i in range(max_val // block_len + 1)]
    for i in int_list:
        blocks[(i // block_len)].append(i) #Catagorize by ranges
    return blocks

def check_length(blocks, block_len):
    for i in range(len(blocks)):
        if len(blocks[i]) != block_len: #Check if range contains a missing value
            return i
    return None

def second_pass(blocks, b, block_len):
    bit_vec = [0] * block_len
    for i in blocks[b]:
        bit_vec[i - (block_len*b)] = 1 #Marks value as seen
    return bit_vec

def find_missing_int(bit_vec):
    for i in range(len(bit_vec)):
        if bit_vec[i] == 0:
            return i
    return None

def missing_int_follow_up(int_list):
    max_val, block_len = 11, 4 #Adjust to Integer.MAX_VALUE, 2**17
    blocks = first_pass(int_list, max_val, block_len)
    short_block_i = check_length(blocks, block_len)
    bit_vec = second_pass(blocks, short_block_i, block_len)  
    missing_i = find_missing_int(bit_vec)
    return missing_i +(block_len*short_block_i)

**(10.8) Find Duplicates:** You have an array with all the numbers from 1 to N, where N is at most 32,000. The array may have duplicate entries and you do not know what N is. With only 4 kilobytes of memory available, how would you print all duplicate elements in the array?

In [10]:
def find_duplicates(array):
    bit_vec = [0] * 500 #(500 x 64) = 32,000 <= 4kb
    duplicates = []
    for i in array:
        bit = 1 << (i % 64)
        position = i // 64
        if not (bit_vec[position] & bit):
            bit_vec[position] |= bit
        else: 
            duplicates.append(i)
    return duplicates #Changed from print to retrun for testing

**(10.9) Sorted Matrix Search:** Given an M x N matrix in which each row and each column is sorted in ascending order, write a method to find an element.

In [11]:
def sorted_matrix_search(matrix, N, e): 
    i,j = 0, N-1 #Index of right / smallest element
    while (i < N and j >= 0):
        if (matrix[i][j] == e): #Found element
            return (i,j)
        elif (matrix[i][j] < e):
            i += 1
        else:
            j -= 1
    return None #Element is not in the matrix 

**(10.10) Rank from Stream:** Imagine you are reading in a stream of integers. Periodically, you wish to be able to look up the rank of a number x (the number of values less than or equal to x). Implement the data structures and algorithms to support these operations. That is, implement the method ```track(int x)```, which is called when each number is generated, and the method ```getRankOfNumber(int x)```, which returns the number of values (less than or equal to x (not including x itself).

In [12]:
class RankBinarySearchTree(BinarySearchTree):

    def track(self, val):
        BinarySearchTree.put(val)
        
    def getRankOfNumber(self, data):
        if not self.root:
            return None
        return self._getRankOfNumber(self.root, data)

    def _getRankOfNumber(self, node, val, found = False):
        if not node:
            return 0
        count = 0
        count += self._getRankOfNumber(node.left, val, found)
        if val <= node.val:
            if not found and val == node.val:
                found = True
            elif found:
                count += 1
        else:
            count += self._getRankOfNumber(node.right, val, found) \
                  + 1
        return count

**(10.11) Peaks and Valleys:** In an array of integers, a "peak" is an element which is greater than or equal to the adjacent integers and a "valley" is an element which is less than or equal to the adjacent integers. For example, in the array (5, 8, 6, 2, 3, 4, 6), (8,6) are peaks and (5, 2) are valleys. Given an array of integers, sort the array into an alternating sequence of peaks and valleys.

In [13]:
def zoom_on_three(array, size, a, b, c): 
    a_val, b_val, c_val = array[a], array[b], array[c] 
    min_val = min(a_val, b_val, c_val)
    if min_val == b_val: #Swap values to put the smallest integer in the middle
        return (a_val, b_val, c_val)
    elif min_val == a_val:
        return (b_val, a_val, c_val)
    else:
        return(a_val, c_val, b_val)

def peaks_and_valleys(array):
    size = len(array)
    if size < 3: #All arrays of length less than 3 follow the pattern
        return array
    a,b,c = 0,1,2
    while c < size:
        array[a], array[b], array[c] = zoom_on_three(array, size, a, b, c)#Look at 3 integers at a time
        a += 2
        b += 2
        c += 2
    return array

def check_peaks_and_valleys(array): #Used for testing
    for i in range(len(array)-2):
        if (array[i] <= array[i+1] and array[i+1] <= array[i+2]) or \
            array[i] >= array[i+1] and array[i+1] >= array[i+2]:
            return False
    return True

**Unit tests:** 

In [14]:
import unittest

class Test(unittest.TestCase):
    def test_sorted_merge(self):
        a = [1,3,5,7,9,None,None,None,None]
        b = [2,4,6,8,]
        sorted_merge(a, b)
        self.assertEqual(a, [1,2,3,4,5,6,7,8,9])
    def test_group_anagrams(self):
        input = ["abc", "ghi", "def", "hig", "cba", "fde"]
        output = ["abc", "cba", "def", "fde", "ghi", "hig"]
        self.assertEqual(group_anagrams(input), output)
    def test_search_in_rotated_array(self):
        array = [15,16,19,20,25,1,3,4,5,7,10,14]
        self.assertEqual(search_in_rotated_array(array, 5), 8)
    def test_sorted_search(self):
        input = Listy([1,2,3,4,5,6])
        self.assertEqual(sorted_search(input, 4), 3)
    def test_sparse_search(self):
        input = ["at", "", "", "", "ball", "", "", "car", "", "", "dad", "", ""]
        self.assertEqual(sparse_search(input, "ball"), 4)
    def test_missing_int(self):
        input = [0,1,2,4]
        self.assertEqual(input.count(missing_int(input)), 0)
    def test_missing_int_follow_up(self):
        input = [0,1,2,3,4,5,6,8,9,10,11]
        self.assertEqual(input.count(missing_int_follow_up(input)), 0)      
    def test_find_duplicates(self):
        input = [0,1,2,2,3,4,4]
        self.assertEqual(find_duplicates(input), [2,4]) 
    def test_sorted_matrix_search(self):
        input = [[0, 1, 2, 3, 4, 5, 6 ],
                 [7, 8, 9, 10,11,12,13],
                 [14,15,16,17,18,19,20 ]]
        self.assertEqual(sorted_matrix_search(input,7,14), (2,0)) 
    def test_rank_from_stream(self):
        input = RankBinarySearchTree()
        input.put([10,2,3,4,9,8])
        self.assertEqual(input.getRankOfNumber(8), 3)
    def test_sorted_matrix_search(self):
        input = [1,2,3,4,5,6,7,8,9]
        self.assertTrue(check_peaks_and_valleys(peaks_and_valleys(input))) 

if __name__ == "__main__":
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

..........
----------------------------------------------------------------------
Ran 10 tests in 0.034s

OK
