# Naive String Matcher algorithm

In [1]:
def NSM(T, P):
  '''
  Finds a fragment P in a text T by use of Naive string matcher algorithm

  Params:
  T - str, a text in which the search will be implemented
  P - str, a substring to be found in T

  Output:
  s - array of integers, the shifts (the indices where P is found in T)
  '''
  shifts = []
  n, m = len(T), len(P)
  for s in range(n-m + 1):
        if P == T[s+1 : s+m+1]:
            shifts.append(s)
  return shifts

In [3]:
t = 'naive string matcher alg'
p = 'al'

NSM(t, p)

[20]

In [4]:
t = 'naive string matcher alg'
p = 'dlw'

NSM(t, p)

[]

In [6]:
t = 'naive string matcher alg'
p = 'a'

NSM(t, p)

[0, 13, 20]

# Rabin-Karp algorithm

3 versions of python implementation if RK algorithm are considered here:

1. Digit string case
2. Strings with any symbols
3. Algorithm with a use of a ready hash function

In [10]:
# Digit case
def RabKarp(T, P, d=10):
    '''
    Finds a fragment P in a text T by use of Rabin-Karp algorithm.
    Only digital text is considered here.

    Params:
    T - str, a text in which the search will be implemented
    P - str, a substring to be found in T
    d - int, the length of the dictionary of symbols used;

    Output:
    s - array of integers, the shifts (the indices where P is found in T)

    '''
    shifts = []
    n, m = len(T), len(P)
    # Selecting the q
    for q in range(2, n**3):
        h = d**(m-1) % q
        if h == int(P[0]):
            break

    p, t = 0, 0
    
    # Finding the coded form of P and T[0:m+1]
    for i in range(m):
        p = (d*p + int(P[i])) % q
        t = (d*t + int(T[i])) % q

    # Comparing with different slices of T
    for s in range(n-m+1):
        if p == t:
            if int(P) == int(T[s : s+m]):
                shifts.append(s)
        if s < n - m:
            t = (d*(t - int(T[s])*h) + int(T[s+m]))%q
    
    return shifts

In [11]:
T = '49503829587592023829'
P = '950'
d = 10
RabKarp(T, P, 10)

[1]

In [12]:
T = '49503829587592023829'
P = '95'
d = 10
RabKarp(T, P, 10)

[1, 7]

In [13]:
T = '49503829587592023829'
P = '9'
d = 10
RabKarp(T, P, 10)

[1, 7, 12, 19]

In [15]:
# Adapting to the case with symbols
def RabKarp_str(T, P):
    '''
    Finds a fragment P in a text T by use of Rabin-Karp algorithm.

    Params:
    T - str, a text in which the search will be implemented
    P - str, a substring to be found in T

    Output:
    s - array of integers, the shifts (the indices where P is found in T)
    '''
    shifts = []
    n, m = len(T), len(P)

    # Creating a dictionary from used symbols with a length d
    alphabet = set(T)
    d = len(alphabet)
    h_dict = {l:i for i, l in enumerate(alphabet)}

    # Selecting the q
    for q in range(2, n**3):
        h = d**(m-1) % q
        if h == h_dict[P[0]]:
            break
            
    p, t = 0, 0
    
    for i in range(m):
        p = (d*p + h_dict[P[i]]) % q
        t = (d*t + h_dict[T[i]]) % q
    for s in range(n-m+1):
        if p == t:
            if P == T[s : s+m]:
                shifts.append(s)
        if s < n - m:
            t = (d*(t - h_dict[T[s]]*h) + h_dict[T[s+m]])%q
    
    return shifts

In [17]:
T='dkdl dld lleed'
P = 'dld'

RabKarp_str(T, P)

[5]

In [18]:
T = 'djkss;; w[[[w jjedw dj'
P = 'dj'

RabKarp_str(T, P)

[0, 20]

In [21]:
# String case using hash function
def RabinKarp_hash(T, P):
    '''
    Finds a fragment P in a text T by use of Rabin-Karp algorithm.

    Params:
    T - str, a text in which the search will be implemented
    P - str, a substring to be found in T

    Output:
    s - array of integers, the shifts (the indices where P is found in T)
    '''
    n, m = len(T), len(P)
    p = hash(P);
    shifts = []

    for s in range(n-m+1):
        t = hash(T[s:s+m])
        if t == p:
            if T[s:s+m] == P:
                shifts.append(s)
    return shifts

In [22]:
T = 'Empty spaces, what are we living for... Abandoned places...'
P = 'living'

RabinKarp_hash(T, P)

[26]

In [23]:
T = 'Empty spaces, what are we living for... Abandoned places...'
P = 'mine'

RabinKarp_hash(T, P)

[]

In [24]:
T = 'Empty spaces, what are we living for... Abandoned places...'
P = 'e'

RabinKarp_hash(T, P)

[10, 21, 24, 47, 54]