In [0]:
# Problem 1: Build a simple, list based k-mer index of a string to be searched

# Example adapted from Ben Langmead (thanks!)

import bisect
import sys

class Index(object):
    ''' Sorted list of (k-mer, offset) pairs covering every k-mer of a text. '''

    def __init__(self, t, k):
      ''' Create index from all substrings of t of length k.

      t: text to be searched
      k: k-mer length
      '''
      self.t = t
      self.k = k  # k-mer length (k)

      # One (k-mer, offset) pair per start position that leaves room for a
      # full k-mer. Sorting the tuples orders them by k-mer and, for equal
      # k-mers, by offset, which is what the binary search in queryKmer needs.
      self.index = sorted((t[i:i + k], i) for i in range(len(t) - k + 1))

    def queryKmer(self, kmer):
      ''' Return locations of kmer in t, in increasing order.

      Raises AssertionError if len(kmer) differs from the index k-mer length.
      '''
      assert len(kmer) == self.k

      hits = []

      # Offsets are never negative, so (kmer, -1) sorts immediately before the
      # first real entry for this k-mer and bisect_left lands exactly on it.
      i = bisect.bisect_left(self.index, (kmer, -1))

      # Equal k-mers are contiguous in the sorted index: collect until it changes.
      while i < len(self.index) and self.index[i][0] == kmer:
        hits.append(self.index[i][1])
        i += 1
      return hits

    def query(self, p):
      ''' Return occurrences of pattern p in t, in increasing order.

      p must be at least k characters long; a shorter pattern fails the
      length assertion in queryKmer.
      '''
      kmer = p[:self.k]
      lenP = len(p)

      # The index only vouches for the first k characters of p, so every
      # candidate location is verified against the whole pattern.
      occurrences = [position for position in self.queryKmer(kmer)
                     if self.t[position: position + lenP] == p]

      return occurrences
      
     
# Search parameters: the k-mer length used to build the index and the pattern to locate
K = 3
pattern = 'GGTATTCGGGA'

# Text to be indexed and searched
text = 'ACTTGGAGATCTTTGAGGCTAGGTATTCGGGATCGAAGCTCATTTCGGGGATCGATTACGATATGGTGGGTATTCGGGA'

index = Index(t=text, k=K)

In [0]:
# Test queryKmer method: "GGT" should be reported at each of its offsets in text
index.queryKmer("GGT") == [21, 64, 68]

True

In [0]:
# Test query method: only the locations where the whole pattern matches should remain
index.query(pattern) == [21, 68]

True

In [0]:
# Report index specificity: the fraction of hits for the pattern's first K
# characters that are occurrences of the whole pattern
float(len(index.query(pattern)))/len(index.queryKmer(pattern[:K]))

0.6666666666666666

In [0]:
# Problem 2: Build a simple suffix array


class SuffixArray(object):
    ''' Suffix array of a text, supporting exact pattern search by binary search. '''

    def __init__(self, t):
      ''' Create suffix array representing suffixes in t '''

      # "$" is appended as an end-of-text sentinel; the worked example below
      # relies on it sorting before the letters of t.
      self.td = t + "$"

      # Array of integers representing lexicographically sorted suffixes of t$
      # e.g. for t$ = ATA$
      # have suffixes
      # 0 = ATA$
      # 1 = TA$
      # 2 = A$
      # 3 = $
      # such that self.index == [ 3, 2, 0, 1 ]
      self.index = sorted(range(len(self.td)), key=lambda x: self.td[x:])

    def suffix(self, item, p):
      ''' Return the suffix of t$ starting at offset item.

      p is not used; it is kept so existing callers keep working.
      '''
      return self.td[item:]

    def query(self, p):
      ''' Return occurrences of pattern p in t.

      The result is a tuple of start offsets, listed in suffix-array order
      (i.e. not necessarily increasing); it is empty when p does not occur.
      '''
      m = len(p)
      n = len(self.index)

      # Suffixes are compared through their first m characters only, so every
      # suffix that starts with p compares equal to p. Truncating sorted
      # strings keeps them in order, so both binary searches remain valid.

      # Lower bound: first suffix whose m-character prefix is >= p.
      left, right = 0, n
      while left < right:
        mid = (left + right)//2
        start = self.index[mid]
        if self.td[start:start + m] < p:
          left = mid + 1
        else:
          right = mid
      first = left

      # Upper bound: first suffix whose m-character prefix is > p.
      right = n
      while left < right:
        mid = (left + right)//2
        start = self.index[mid]
        if self.td[start:start + m] > p:
          right = mid
        else:
          left = mid + 1

      # Drop matches that would extend into the "$" sentinel: those are not
      # occurrences in t itself (only possible when p contains "$").
      text_len = len(self.td) - 1
      return tuple(pos for pos in self.index[first:left] if pos + m <= text_len)


      

In [0]:
# Test suffix array construction: the suffixes of "ATA$" sort as $, A$, ATA$, TA$
sa = SuffixArray("ATA")
sa.index == [ 3, 2, 0, 1 ]

[3, 2, 0, 1]


True

In [30]:
# Test suffix array search
# sorted() makes the check independent of the order in which query reports the offsets
sa = SuffixArray(text)
print(sa.query(pattern))
sorted(sa.query(pattern)) == [21, 68]

(68, 21)


True