In [0]:
# Problem 1: Build a simple, list-based k-mer index of a string to be searched

# Example adapted from Ben Langmead (thanks!)

import bisect
import sys

class Index(object):
    ''' Sorted-list k-mer index of a text, searched with binary search.

    Attributes:
      t:     the indexed text.
      k:     the k-mer length.
      index: list of (k-mer, offset) tuples, one per k-mer of t, sorted
             lexicographically by k-mer and then by offset.
    '''

    def __init__(self, t, k):
        ''' Create index from all substrings of t of size k '''
        self.t = t
        self.k = k  # k-mer length (k)

        # One (k-mer, offset) pair per start position that has a full k-mer;
        # sorting the tuples orders by k-mer first, then by offset.
        self.index = sorted((t[i:i + k], i) for i in range(len(t) - k + 1))

    def queryKmer(self, kmer):
        ''' Return locations of kmer in t, in increasing order.

        kmer must have length exactly self.k (checked with an assert).
        '''
        assert len(kmer) == self.k

        # (kmer,) compares lower than every (kmer, offset) entry, so this is
        # the first entry for kmer, if there is one.
        first = bisect.bisect_left(self.index, (kmer,))
        # Every stored offset is at most len(t), so this lands just past the
        # last entry for kmer.
        last = bisect.bisect_right(self.index, (kmer, len(self.t)), first)

        return [offset for _, offset in self.index[first:last]]

    def query(self, p):
        ''' Return occurrences of pattern p in t, in increasing order.

        Patterns of length >= k are located through the index: the first k
        characters are looked up, then the rest of p is verified against t.
        Patterns shorter than k cannot be looked up as a k-mer, so they are
        found by scanning t directly (an empty p matches at every offset
        from 0 to len(t) inclusive).
        '''
        if len(p) < self.k:
            # The index only stores full k-mers, so it would miss matches in
            # the last k-1 positions of t; scan the text instead.
            return [i for i in range(len(self.t) - len(p) + 1)
                    if self.t.startswith(p, i)]

        rest = p[self.k:]
        # A hit too close to the end of t produces a shorter slice, which
        # cannot equal rest, so it is rejected without a bounds check.
        return [hit for hit in self.queryKmer(p[:self.k])
                if self.t[hit + self.k:hit + len(p)] == rest]
      
     
# Example inputs shared by the cells below: the text to index and a pattern to search for
text = 'ACTTGGAGATCTTTGAGGCTAGGTATTCGGGATCGAAGCTCATTTCGGGGATCGATTACGATATGGTGGGTATTCGGGA'
pattern = 'GGTATTCGGGA'
K = 3  # k-mer length used for the index

# Build the k-mer index of text once; the test cells below query it
index = Index(text, K)

In [0]:
# Test queryKmer method: "GGT" is expected at offsets 21, 64 and 68 of text
index.queryKmer("GGT") == [21, 64, 68]

True

In [0]:
# Test query method: the full pattern is expected at offsets 21 and 68 of text
index.query(pattern) == [21, 68]

True

In [0]:
# Report index specificity: the fraction of index hits for the pattern's first
# K characters that are occurrences of the whole pattern
float(len(index.query(pattern)))/len(index.queryKmer(pattern[:K]))

0.6666666666666666

In [0]:
# Problem 2: Build a simple suffix array


class SuffixArray(object):
    ''' Suffix array of a text, supporting exact-match pattern search.

    Attributes:
      td:    the text with a "$" terminator appended.
      index: list of start offsets into td, ordered so that the suffixes
             starting at those offsets are in lexicographic order.
    '''

    def __init__(self, t):
        ''' Create suffix array representing suffixes in t '''
        self.td = t + "$"

        # e.g. for t$ = ATA$
        # have suffixes
        # 0 = ATA$
        # 1 = TA$
        # 2 = A$
        # 3 = $
        # such that self.index == [ 3, 2, 0, 1 ]
        # No two suffixes are equal (they all differ in length), so sorting
        # offsets by their suffix gives a unique order.
        self.index = sorted(range(len(self.td)), key=lambda i: self.td[i:])

    def _rank(self, p, strict):
        ''' Binary search self.index for a boundary of the block matching p.

        Compares p with the first len(p) characters of each suffix. Returns
        the first position whose truncated suffix is >= p (strict=False) or
        > p (strict=True); returns len(self.index) if there is none.
        '''
        width = len(p)
        lo, hi = 0, len(self.index)
        while lo < hi:
            mid = (lo + hi) // 2
            start = self.index[mid]
            prefix = self.td[start:start + width]
            if prefix < p or (strict and prefix == p):
                lo = mid + 1
            else:
                hi = mid
        return lo

    def query(self, p):
        ''' Return all occurrences of pattern p in t, in increasing order.

        Returns an empty list when p does not occur. An empty p matches at
        every offset from 0 to len(t) inclusive.
        '''
        # Suffixes that start with p sit next to each other in sorted order,
        # so two binary searches delimit them.
        first = self._rank(p, strict=False)
        past_last = self._rank(p, strict=True)

        # Drop any match that would extend into the "$" terminator: that is
        # a match in td but not an occurrence in t.
        last_start = len(self.td) - 1 - len(p)
        return sorted(offset for offset in self.index[first:past_last]
                      if offset <= last_start)
      
      
      
      
      

In [0]:
# Test suffix array construction: for "ATA$" the sorted suffixes are
# $ (3), A$ (2), ATA$ (0), TA$ (1)
sa = SuffixArray("ATA")
sa.index == [ 3, 2, 0, 1 ]

True

In [0]:
# Test suffix array search: the pattern is expected at offsets 21 and 68 of text
# (sorted() makes the comparison independent of the order query returns them in)
sa = SuffixArray(text)
sorted(sa.query(pattern)) == [21, 68]

True