# 19.0 Advanced Algorithms

## 19.1 Rabin-Karp

### Problem Statement
Given an input sequence of size $n$ and a pattern of size $m$, implement a function that finds all occurrences of the pattern in the sequence using, on average, $O(n+m)$ comparisons between sequence and pattern, instead of the $O(nm)$ comparisons made by brute-force pattern matching.

In [1]:
from collections import namedtuple
import unittest


BASE = 131  # Next prime after 2^7.
MODULUS = (1 << 61) - 1  # Mersenne prime 2^61 - 1; bounds patmatch's hashes.


def _slide(prevhash, outgoing, incoming, lead, base, mod):
    """Drop `outgoing` from the front of a window hash, append `incoming`.

    `lead` is the weight of the outgoing character, base**(m-1) for a
    window of m characters (already reduced modulo `mod` if one is used).
    """
    newhash = (prevhash - lead * ord(outgoing)) * base + ord(incoming)
    return newhash if mod is None else newhash % mod


def rolling_hash(string, base=BASE, prevhash=None, prevchar=None, mod=None):
    """Compute rolling hash from previous hash value and character.

    Args:
        string: The current window of characters.
        base: Radix of the polynomial hash.
        prevhash: Hash of the previous window (same length as `string`,
            shifted left by one).  If None, `string` is hashed in full.
        prevchar: First character of the previous window, i.e. the one
            that slid out.  Required whenever `prevhash` is given.
        mod: Optional modulus.  None (the default) keeps the exact,
            unbounded integer hash; otherwise every result is reduced
            modulo `mod`, and `prevhash` must have been computed with
            the same modulus.

    Returns:
        The integer hash of `string`.
    """

    # Compute the full hash.
    if prevhash is None:
        newhash = 0
        for c in string:
            newhash = (newhash*base) + ord(c)  # Horner's rule.
            if mod is not None:
                newhash %= mod
        return newhash

    # Compute a rolling hash by removing the contribution of the
    # previous character and adding contribution of last character.
    m = len(string)
    lead = base**(m-1) if mod is None else pow(base, m-1, mod)
    return _slide(prevhash, prevchar, string[-1], lead, base, mod)


def patmatch(string, pattern):
    """Return start index of all occurrences of pattern in string.

    An empty pattern matches at every position, including the end of
    the string, so the result is then [0, 1, ..., len(string)].
    """

    n, m = len(string), len(pattern)
    if m > n:
        return []
    if m == 0:
        # There is no window to hash; every position is a match.
        return list(range(n+1))

    # Hash the pattern and the first window, then step through the
    # string char-by-char sliding the window hash in O(1) per step.
    # Hashes are reduced modulo MODULUS so they stay bounded, and the
    # weight of the outgoing character is computed only once.
    pathash = rolling_hash(pattern, mod=MODULUS)
    strhash = rolling_hash(string[:m], mod=MODULUS)
    lead = pow(BASE, m-1, MODULUS)

    matches = []
    for ind in range(n-m+1):
        if ind:
            strhash = _slide(strhash, string[ind-1], string[ind+m-1],
                             lead, BASE, MODULUS)
        # Only on a hash hit build the window and perform a full
        # comparison (to avoid spurious false positives).
        if strhash == pathash and string[ind:ind+m] == pattern:
            matches.append(ind)

    return matches


class PatmatchTest(unittest.TestCase):
    """Unit tests for rolling_hash and patmatch."""

    def test_rolling_hash(self):
        """Rolling across a doubled word returns to the word's own hash."""
        for word in ('hello', 'world', 'rolling', 'hash'):
            size = len(word)
            # Reference value: the hash of the word computed in full.
            expected = rolling_hash(word)
            # Slide a window of len(word) over word+word one character
            # at a time.  After len(word) steps the window holds the
            # word again, so the rolled hash must equal the full hash.
            doubled = word + word
            rolled = expected
            for start in range(1, size + 1):
                rolled = rolling_hash(doubled[start:start + size],
                                      prevhash=rolled,
                                      prevchar=doubled[start - 1])
            self.assertEqual(expected, rolled)

    def test_patmatch(self):
        """patmatch returns the expected start indices for each case."""
        Case = namedtuple('case', ['string', 'pattern', 'expected'])
        table = [
            ('abracadabra', 'abr', [0, 7]),
            ('hello', 'hello', [0]),
            ('hellohello', 'hello', [0, 5]),
            # Negative example.
            ('hello', 'hallo', []),
            # Some numerical examples as strings.
            ('0123456789', '0123', [0]),
            ('0123456789', '3456', [3]),
            ('0123456789', '6789', [6]),
            ('0123456789', '019', []),
            ('0123456789', '349', []),
            ('0123456789', '780', []),
        ]
        for entry in (Case(*row) for row in table):
            found = patmatch(entry.string, entry.pattern)
            self.assertEqual(found, entry.expected)


# Run the tests inside the current interpreter session:
#   argv=['']   supplies a placeholder program name so unittest does not
#               try to parse the host process's own command line;
#   exit=False  returns the TestProgram instead of calling sys.exit();
#   verbosity=2 prints one result line per test method.
# NOTE(review): the PatmatchTest instance is what the original passed as
# the first positional argument, which unittest.main names `module`;
# confirm that restricting the run to this class is the intent.
unittest.main(module=PatmatchTest(), argv=[''], exit=False, verbosity=2)

test_patmatch (__main__.PatmatchTest) ... ok
test_rolling_hash (__main__.PatmatchTest) ... ok

----------------------------------------------------------------------
Ran 2 tests in 0.005s

OK


<unittest.main.TestProgram at 0x7f1f443c1588>