# Simple LCS

### compute score matix and backtracking matrices
- v = string v
- w = string w

In [1]:
from numpy import *

def findLCS(v, w):
    score = zeros((len(v)+1,len(w)+1), dtype="int32")
    backt = zeros((len(v)+1,len(w)+1), dtype="int32")
    for i in range(1,len(v)+1):
        for j in range(1,len(w)+1):
            # find best score at each vertex
            if (v[i-1] == w[j-1]):  # test for a match ("diagonal street")
                score[i,j], backt[i,j] = max((score[i-1,j-1]+1,3), (score[i-1,j],1), (score[i,j-1],2))
            else:
                score[i,j], backt[i,j] = max((score[i-1,j],1), (score[i,j-1],2))
    return score, backt

### recursively compute the lcs from the backtracking matrix
- b = backtracking matrix
- v = string v
- i = length of string v
- j = length of string w

In [2]:
def LCS(b,v,i,j):
    if ((i == 0) and (j == 0)):
        return ''
    elif (b[i,j] == 3):
        return LCS(b,v,i-1,j-1) + v[i-1]
    elif (b[i,j] == 2):
        return LCS(b,v,i,j-1)
    else:
        return LCS(b,v,i-1,j)

### find the alignments of the two original strings that result in the lcs
- b = backtracking matrix
- v = string v
- w = string w
- i = length of string v
- j = length of string w

In [36]:
def Alignment(b,v,w,i,j):
    if ((i == 0) and (j == 0)):
        return ['','']
    if (b[i,j] == 3):
        result = Alignment(b,v,w,i-1,j-1)
        result[0] += v[i-1]
        result[1] += w[j-1]
        return result
    if (b[i,j] == 2):
        result = Alignment(b,v,w,i,j-1)
        result[0] += "_"
        result[1] += w[j-1]
        return result
    if (b[i,j] == 1):
        result = Alignment(b,v,w,i-1,j)
        result[0] += v[i-1]
        result[1] += "_"
        return result

### Example: Insulin Protein in Human's and Pigs

In [10]:
human = "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN"
pig = "MALWTRLLPLLALLALWAPAPAQAFVNQHLCGSHLVEALYLVCGERGFFYTPKARREAENPQAGAVELGGGLGGLQALALEGPPQKRGIVEQCCTSICSLYQLENYCN"

v = human
w = pig
s, b = findLCS(v,w)

lcs = LCS(b, v, b.shape[0]-1,b.shape[1]-1)
print(lcs, len(lcs))

align = Alignment(b,v,w,b.shape[0]-1,b.shape[1]-1)
print("v =", align[0])
print("w =", align[1])

MALWRLLPLLALLALWPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKRREAEQGLQLALEGQKRGIVEQCCTSICSLYQLENYCN 87
v = MALWM_RLLPLLALLALWGPD_PA_A_AFVNQHLCGSHLVEALYLVCGERGFFYTPKT_RREAEDL__Q_GS__________LQP_LALEGSL__QKRGIVEQCCTSICSLYQLENYCN
w = MALW_TRLLPLLALLALW___APAPAQAFVNQHLCGSHLVEALYLVCGERGFFYTPK_ARREAE__NPQAG_AVELGGGLGGLQ_ALALEG__PPQKRGIVEQCCTSICSLYQLENYCN


# LCS with Scores for Indels, Matches, Mismatches

### compute score and backtracking matrices
use the same LCS and Alignment functions to recover the LCS and alignments
- v = string v
- w = string w
- indel = penalty/reward for indel
- match = penalty/reward for match
- mismatch = penalty/reward for mismatch
- adjacent = penalty/reward for alphabetically adjacent characters (i.e. M,N)

In [40]:
import numpy

def GlobalAlign(v, w, indel, match, mismatch, adjacent):
    s = numpy.zeros((len(v)+1,len(w)+1), dtype="int32")
    b = numpy.zeros((len(v)+1,len(w)+1), dtype="int32")
    for i in range(0,len(v)+1):
        for j in range(0,len(w)+1):
            if (j == 0):
                if (i > 0):
                    s[i,j] = s[i-1,j] + indel
                    b[i,j] = 1
                continue
            if (i == 0):
                s[i,j] = s[i,j-1] + indel
                b[i,j] = 2
                continue
            if abs(ord(v[i-1]) - ord(w[j-1])) == 0: # check if there is a match
                score = s[i-1,j-1] + match
            elif abs(ord(v[i-1]) - ord(w[j-1])) == 1: # check if adjacent
                score = s[i-1,j-1] + adjacent
            else: # check for all other mismatches
                score = s[i-1,j-1] + mismatch
                
            vskip = s[i-1,j] + indel
            wskip = s[i,j-1] + indel
            s[i,j] = max(vskip, wskip, score)
            if (s[i,j] == vskip):
                b[i,j] = 1
            elif (s[i,j] == wskip):
                b[i,j] = 2
            else:
                b[i,j] = 3
    return (s, b)

# Typical alignment game from lecture 15
    # indel: -1
    # match: +1
    # mismatch: +/- 0
    
w = "GCTGGAAGGCAT"
v = "GCAGAGCACT"

s, b = GlobalAlign(v, w, -1, 1, 0, 0)
print(s)
print()
print(b)
lcs = LCS(b, v, b.shape[0]-1,b.shape[1]-1)
print()
print(lcs)

[[  0  -1  -2  -3  -4  -5  -6  -7  -8  -9 -10 -11 -12]
 [ -1   1   0  -1  -2  -3  -4  -5  -6  -7  -8  -9 -10]
 [ -2   0   2   1   0  -1  -2  -3  -4  -5  -6  -7  -8]
 [ -3  -1   1   2   1   0   0  -1  -2  -3  -4  -5  -6]
 [ -4  -2   0   1   3   2   1   0   0  -1  -2  -3  -4]
 [ -5  -3  -1   0   2   3   3   2   1   0  -1  -1  -2]
 [ -6  -4  -2  -1   1   3   3   3   3   2   1   0  -1]
 [ -7  -5  -3  -2   0   2   3   3   3   3   3   2   1]
 [ -8  -6  -4  -3  -1   1   3   4   3   3   3   4   3]
 [ -9  -7  -5  -4  -2   0   2   3   4   3   4   3   4]
 [-10  -8  -6  -4  -3  -1   1   2   3   4   3   4   4]]

[[0 2 2 2 2 2 2 2 2 2 2 2 2]
 [1 3 2 2 2 2 2 2 2 2 2 2 2]
 [1 1 3 2 2 2 2 2 2 2 2 2 2]
 [1 1 1 3 2 2 3 2 2 2 2 2 2]
 [1 1 1 1 3 2 2 2 3 2 2 2 2]
 [1 1 1 1 1 3 3 2 2 2 2 3 2]
 [1 1 1 1 1 3 3 3 3 2 2 2 2]
 [1 1 1 1 1 1 3 3 3 3 3 2 2]
 [1 1 1 1 1 1 3 3 2 3 3 3 2]
 [1 1 1 1 1 1 1 1 3 2 3 1 3]
 [1 1 1 3 1 1 1 1 1 3 1 3 3]]

GCAGAGCAT


# Global Alignment with Scoring Matrix

### compute score and backtracking matrices

use the same LCS and Alignment functions to recover the LCS and alignments

- v = string v
- w = string w
- scorematrix = dictionary that maps a tuple ("A", "C") to a score reflecting the mutation probability
- indel = penalty for an indel

In [39]:
import numpy

def GlobalAlignSM(v, w, scorematrix, indel):
    s = numpy.zeros((len(v)+1,len(w)+1), dtype="int32")
    b = numpy.zeros((len(v)+1,len(w)+1), dtype="int32")
    for i in range(1,len(v)+1):
        for j in range(1,len(w)+1):
            if (j == 0):
                if (i > 0):
                    s[i,j] = s[i-1,j] + indel
                    b[i,j] = 1
                continue
            if (i == 0):
                s[i,j] = s[i,j-1] + indel
                b[i,j] = 2
                continue
            score = s[i-1,j-1] + scorematrix[v[i-1],w[j-1]]
            vskip = s[i-1,j] + indel
            wskip = s[i,j-1] + indel
            s[i,j] = max(vskip, wskip, score)
            if (s[i,j] == vskip):
                b[i,j] = 1
            elif (s[i,j] == wskip):
                b[i,j] = 2
            elif (s[i,j] == score):
                b[i,j] = 3
            else:
                b[i,j] = 0
    return (s, b)

match = {('A','A'):  5, ('A','C'): -4, ('A','G'): -4, ('A','T'): -4,
         ('C','A'): -4, ('C','C'):  5, ('C','G'): -4, ('C','T'): -4,
         ('G','A'): -4, ('G','C'): -4, ('G','G'):  5, ('G','T'): -4,
         ('T','A'): -4, ('T','C'): -4, ('T','G'): -4, ('T','T'):  5}

w = "GCTGGAAGGCAT"
v = "GCAGAGCACT"

s, b = GlobalAlignSM(v,w,match,-1)
print(s)
print()
print(b)
lcs = LCS(b, v, b.shape[0]-1,b.shape[1]-1)
print()
print(lcs)

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  5  4  3  5  5  4  3  5  5  4  3  2]
 [ 0  4 10  9  8  7  6  5  4  4 10  9  8]
 [ 0  3  9  8  7  6 12 11 10  9  9 15 14]
 [ 0  5  8  7 13 12 11 10 16 15 14 14 13]
 [ 0  4  7  6 12 11 17 16 15 14 13 19 18]
 [ 0  5  6  5 11 17 16 15 21 20 19 18 17]
 [ 0  4 10  9 10 16 15 14 20 19 25 24 23]
 [ 0  3  9  8  9 15 21 20 19 18 24 30 29]
 [ 0  2  8  7  8 14 20 19 18 17 23 29 28]
 [ 0  1  7 13 12 13 19 18 17 16 22 28 34]]

[[0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 3 2 2 3 3 2 2 3 3 2 2 2]
 [0 1 3 2 2 2 2 2 1 1 3 2 2]
 [0 1 1 1 1 1 3 2 2 2 1 3 2]
 [0 3 1 1 3 2 1 1 3 2 2 1 1]
 [0 1 1 1 1 1 3 2 1 1 1 3 2]
 [0 3 1 1 1 3 1 1 3 2 2 1 1]
 [0 1 3 2 1 1 1 1 1 1 3 2 2]
 [0 1 1 1 1 1 3 2 1 1 1 3 2]
 [0 1 1 1 1 1 1 1 1 1 1 1 1]
 [0 1 1 3 2 1 1 1 1 1 1 1 3]]

GCGAGCAT


# Local Alignment with Scoring Matrix

### compute score and backtracking matrices

- v = string v
- w = string w
- scorematrix = dictionary that maps a tuple ("A", "C") to a score reflecting the mutation probability
- indel = penalty for an indel

In [42]:
import numpy

def LocalAlign(v, w, scorematrix, indel):
    s = numpy.zeros((len(v)+1,len(w)+1), dtype="int32")
    b = numpy.zeros((len(v)+1,len(w)+1), dtype="int32")
    for i in range(1,len(v)+1):
        for j in range(1,len(w)+1):
            if (j == 0):
                if (i > 0):
                    s[i,j] = max(s[i-1,j] + indel, 0)
                    b[i,j] = 1
                continue
            if (i == 0):
                s[i,j] = max(s[i,j-1] + indel, 0)
                b[i,j] = 2
                continue
            score = s[i-1,j-1] + scorematrix[v[i-1],w[j-1]]
            vskip = s[i-1,j] + indel
            wskip = s[i,j-1] + indel
            s[i,j] = max(vskip, wskip, score, 0)
            if (s[i,j] == vskip):
                b[i,j] = 1
            elif (s[i,j] == wskip):
                b[i,j] = 2
            elif (s[i,j] == score):
                b[i,j] = 3
            else:
                b[i,j] = 0
    return (s, b)

match = {('A','A'):  5, ('A','C'): -4, ('A','G'): -4, ('A','T'): -4,
         ('C','A'): -4, ('C','C'):  5, ('C','G'): -4, ('C','T'): -4,
         ('G','A'): -4, ('G','C'): -4, ('G','G'):  5, ('G','T'): -4,
         ('T','A'): -4, ('T','C'): -4, ('T','G'): -4, ('T','T'):  5}

w = "GCTGGAAGGCAT"
v = "GCAGAGCACT"

s, b = LocalAlign(v,w,match,-7)
print(s)
print()
print(b)

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  5  0  0  5  5  0  0  5  5  0  0  0]
 [ 0  0 10  3  0  1  1  0  0  1 10  3  0]
 [ 0  0  3  6  0  0  6  6  0  0  3 15  8]
 [ 0  5  0  0 11  5  0  2 11  5  0  8 11]
 [ 0  0  1  0  4  7 10  5  4  7  1  5  4]
 [ 0  5  0  0  5  9  3  6 10  9  3  0  1]
 [ 0  0 10  3  0  2  5  0  3  6 14  7  0]
 [ 0  0  3  6  0  0  7 10  3  0  7 19 12]
 [ 0  0  5  0  2  0  0  3  6  0  5 12 15]
 [ 0  0  0 10  3  0  0  0  0  2  0  5 17]]

[[0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 3 0 0 3 3 0 0 3 3 0 0 0]
 [0 0 3 2 0 3 3 0 0 3 3 2 0]
 [0 0 1 3 0 0 3 3 0 0 1 3 2]
 [0 3 0 0 3 3 0 3 3 3 0 1 3]
 [0 0 3 0 1 3 3 3 1 3 3 3 1]
 [0 3 0 0 3 3 1 3 3 3 3 0 3]
 [0 0 3 2 0 1 3 0 1 3 3 2 2]
 [0 0 1 3 0 0 3 3 2 0 1 3 2]
 [0 0 3 0 3 0 1 1 3 0 3 1 3]
 [0 0 0 3 2 0 0 0 0 3 0 1 3]]


### recursively compute the local lcs from the backtracking matrix
- b = backtracking matrix
- v = string v
- w = string w
- i = length of string v
- j = length of string w

In [43]:
def LocalAlignment(b,v,w,i,j):
    if (b[i,j] == 0):
        return ['','']
    if (b[i,j] == 3):
        result = LocalAlignment(b,v,w,i-1,j-1)
        result[0] += v[i-1]
        result[1] += w[j-1]
        return result
    if (b[i,j] == 2):
        result = LocalAlignment(b,v,w,i,j-1)
        result[0] += "_"
        result[1] += w[j-1]
        return result
    if (b[i,j] == 1):
        result = LocalAlignment(b,v,w,i-1,j)
        result[0] += v[i-1]
        result[1] += "_"
        return result
    
maxij = numpy.unravel_index(s.argmax(), s.shape)
print(maxij, s[maxij])
vsub, wsub = LocalAlignment(b,v,w,maxij[0],maxij[1])
print(v)
print(w)
print(vsub)
print(wsub)

(8, 11) 19
GCAGAGCACT
GCTGGAAGGCAT
GCAGAGCA
GAAG_GCA
