# Blast and Alternatives
Finding similar sequences based on sequence alignment algorithms

K-mer indexing scheme (strategy)

## Sequence similarity
Useful as a similarity metric between sequences (strings)

In [41]:
import operator
import pandas as pd

In [42]:
def read_submat(filename="blosum62.mat"):
    """Load a tab-separated substitution matrix.

    The first line of the file is used as the header and supplies the
    alphabet. Data row ``i`` is taken to hold the scores of the ``i``-th
    symbol of that alphabet (rows are addressed by position).

    Args:
        filename: Path to the matrix file. Defaults to "blosum62.mat".

    Returns:
        A tuple ``(df, alphabet, sm)``: the DataFrame as read, the list of
        column labels, and a dict mapping each two-symbol key
        (row symbol + column symbol) to its score.
    """
    df = pd.read_csv(filename, header=0, delimiter="\t")
    alphabet = list(df.columns)
    # Use purely positional access. The chained form df.loc[i][j] depends on
    # integer keys falling back to positions on a label-indexed Series,
    # which pandas has deprecated.
    positions = range(len(alphabet))
    sm = {
        alphabet[i] + alphabet[j]: df.iat[i, j]
        for i in positions
        for j in positions
    }
    return df, alphabet, sm

In [43]:
def max_mat(mat):
    """Return the (row, column) position of the largest cell of ``mat``.

    The scan is row by row, left to right, and only a strictly greater
    value replaces the current best, so the first occurrence wins on ties.
    """
    best = mat[0][0]
    position = (0, 0)
    for r, row in enumerate(mat):
        for c, value in enumerate(row):
            if value > best:
                best = value
                position = (r, c)
    return position

In [44]:
def recover_align_local(S, T, seq1, seq2):
    """Recover one optimal local alignment from the score/trace matrices.

    The walk starts at the cell of ``S`` reported by ``max_mat`` and follows
    the codes stored in ``T`` until a cell whose code is not positive:
    1 moves diagonally (one symbol from each sequence), 2 moves up one row
    (symbol of ``seq1`` against a gap), 3 moves left one column (gap against
    a symbol of ``seq2``).

    Returns:
        A two-element list with the aligned fragments of ``seq1`` and
        ``seq2``; gaps are written as "-".
    """
    row, col = max_mat(S)
    # Symbols are collected from the end of the alignment backwards and
    # reversed once when the walk is over.
    top, bottom = [], []
    while T[row][col] > 0:
        move = T[row][col]
        if move == 1:
            top.append(seq1[row - 1])
            bottom.append(seq2[col - 1])
            row -= 1
            col -= 1
        elif move == 3:
            top.append("-")
            bottom.append(seq2[col - 1])
            col -= 1
        elif move == 2:
            top.append(seq1[row - 1])
            bottom.append("-")
            row -= 1
    return ["".join(reversed(top)), "".join(reversed(bottom))]

In [45]:
def score_col_alignment(s1, s2, sm, g):
    """Score a single alignment column.

    Args:
        s1: Symbol from the first sequence.
        s2: Symbol from the second sequence.
        sm: Substitution scores keyed by the concatenation ``s1 + s2``.
        g: Score returned when either symbol is a gap.

    Returns:
        ``g`` if either symbol is a gap, otherwise ``sm[s1 + s2]``.

    Raises:
        KeyError: If neither symbol is a gap and the pair is not in ``sm``.
    """
    # "_" is the gap symbol this function originally recognised; "-" is the
    # one written by recover_align_local, so both are accepted.
    gap_symbols = ("_", "-")
    if s1 in gap_symbols or s2 in gap_symbols:
        return g
    return sm[s1 + s2]

In [51]:
def max3t(v1, v2, v3):
    """Return 1, 2 or 3 to tell which argument holds the largest value.

    Comparisons are strict, so on ties the later argument is reported:
    ``v2`` over ``v1`` and ``v3`` over either of them.
    """
    leader, label = (v1, 1) if v1 > v2 else (v2, 2)
    if leader > v3:
        return label
    return 3

In [52]:
def smith_waterman(seq1, seq2, sm, g):
    """Fill the local-alignment score and trace-back matrices.

    Args:
        seq1: Sequence indexing the rows.
        seq2: Sequence indexing the columns.
        sm: Substitution scores, passed on to ``score_col_alignment``.
        g: Value added to the score for a gap step.

    Returns:
        ``(S, T, maxscore)``. ``S`` and ``T`` have ``len(seq1) + 1`` rows and
        ``len(seq2) + 1`` columns; row 0 and column 0 stay at zero. ``T``
        holds 0 where the best candidate was not positive, otherwise the
        code from ``max3t``: 1 diagonal, 2 from the row above, 3 from the
        column to the left. ``maxscore`` is the largest value stored in
        ``S`` (0 if none is positive).
    """
    n_rows = len(seq1) + 1
    n_cols = len(seq2) + 1
    S = [[0] * n_cols for _ in range(n_rows)]
    T = [[0] * n_cols for _ in range(n_rows)]
    maxscore = 0
    for i in range(1, n_rows):
        for j in range(1, n_cols):
            diagonal = S[i - 1][j - 1] + score_col_alignment(
                seq1[i - 1], seq2[j - 1], sm, g
            )
            from_above = S[i - 1][j] + g
            from_left = S[i][j - 1] + g
            best = max(diagonal, from_above, from_left)
            if best <= 0:
                # Local alignment: non-positive scores restart at zero,
                # which the pre-filled matrices already hold.
                continue
            S[i][j] = best
            T[i][j] = max3t(diagonal, from_above, from_left)
            if best > maxscore:
                maxscore = best
    return (S, T, maxscore)

In [71]:
def test_local_alig():
    """Run a small local-alignment example and print the results.

    Reads the substitution matrix from 'blosum62.mat', aligns "PHSWG"
    against "HGWAG" with a gap score of -8, and prints the best score, the
    S and T matrices and one recovered alignment (one sequence per line).
    """
    # read_submat returns (DataFrame, alphabet, dict); only the dict of
    # pair scores is needed by smith_waterman.
    _, _, sm = read_submat('blosum62.mat')
    seq1 = "PHSWG"
    seq2 = "HGWAG"
    S, T, score = smith_waterman(seq1, seq2, sm, -8)
    print("Score of optimal alignment:", score)
    print(S)
    print(T)
    alinL = recover_align_local(S, T, seq1, seq2)
    print(alinL[0])
    print(alinL[1])

In [82]:
def find_similar(query, list_of_seqs, sm, g):
    """Print the best local-alignment match for ``query`` in a "database".

    Every sequence of ``list_of_seqs`` is aligned against ``query`` with
    ``smith_waterman``; the one with the highest score is selected (the
    first one on ties).

    Args:
        query: Query sequence.
        list_of_seqs: Sequences to search.
        sm: Substitution matrix passed to ``smith_waterman``.
        g: Gap value passed to ``smith_waterman``.

    Prints the S matrix, the T matrix, the score and the selected sequence.
    Nothing is returned.
    """
    # Each entry pairs the (S, T, score) result with the sequence it was
    # computed for.
    scored = [
        (smith_waterman(query, candidate, sm, g), candidate)
        for candidate in list_of_seqs
    ]
    best_result, seq = max(scored, key=lambda pair: pair[0][2])
    S, T, score = best_result
    print(S, T, score, seq)

In [83]:
# Load the substitution matrix from the default file and search a small
# list of sequences for the best local match to the query "ACTG".
df, alf, sm = read_submat()
# NOTE(review): g=1 is positive, so every gap step adds to the score inside
# smith_waterman instead of penalising it - confirm this is intended.
find_similar(
    "ACTG",
    ["ACTTGC", "ATATAT", "CATAGA"],
    sm,
    g=1,
)

[[0, 0, 0, 0, 0, 0, 0], [0, 4, 5, 6, 7, 8, 9], [0, 5, 13, 14, 15, 16, 17], [0, 6, 14, 18, 19, 20, 21], [0, 7, 15, 19, 20, 25, 26]] [[0, 0, 0, 0, 0, 0, 0], [0, 1, 3, 3, 3, 3, 3], [0, 2, 1, 3, 3, 3, 3], [0, 2, 2, 1, 3, 3, 3], [0, 2, 2, 2, 3, 1, 3]] 26 ACTTGC


# Para o exame
Pode ser pedido um `align_query(query, ls, ms, g)`. O exame é escrito, mas terá sempre uma componente prática: metade consiste em preencher a matriz de alinhamento e a outra metade em escrever código. É possível consultar material das aulas em papel.

# BLAST
Basic Local Alignment Search Tool - a family of related programs

Find regions of sequences that are locally similar

Idea: a hash map with partial matching, which assumes that the keys can express the degree of similarity; this could be an interesting way to improve search times.