## BA5F

**Local Alignment Problem**

Find the highest-scoring local alignment between two strings.

Given: Two amino acid strings.

Return: The maximum score of a local alignment of the strings, followed by a local alignment of these strings achieving the maximum score. Use the PAM250 scoring matrix and indel penalty σ = 5. (If multiple local alignments achieving the maximum score exist, you may return any one.)

Link: https://rosalind.info/problems/ba5f/

In [None]:
import numpy as np
import pandas as pd

In [None]:
def InsertIndel(word,i):
    """Return a copy of `word` with a '-' placed in front of position `i`."""
    head = word[:i]
    tail = word[i:]
    return head + '-' + tail

In [None]:
# The scoring table is stored as a pandas DataFrame.
class PAM250(object):
    """Substitution scoring table loaded from a text file.

    Indexing an instance with a pair, ``matrix[a, b]``, returns the entry
    found in column ``a`` and row ``b`` of the loaded table.
    """

    def __init__(self, path='/content/PAM250.txt'):
        """Read the scoring table from `path`.

        Args:
            path: Location of the table file. The default is the path this
                notebook originally hard-coded, so ``PAM250()`` keeps working.
        """
        # A separator longer than one character is interpreted by pandas as a
        # regular expression, which only the Python parsing engine supports.
        # Naming the engine explicitly avoids the fallback ParserWarning.
        self.scoring_matrix = pd.read_table(path, sep='  ', engine='python')

    def __getitem__(self, pair):
        """Return the table entry for `pair` = (column label, row label)."""
        column, row = pair[0], pair[1]
        # Single scalar lookup instead of building an intermediate column Series.
        return self.scoring_matrix.at[row, column]

In [None]:
def LocalAlignment(v, w, scoring_matrix, sigma=5):
    """Find a highest-scoring local alignment of the strings `v` and `w`.

    Args:
        v: First string.
        w: Second string.
        scoring_matrix: Object indexable as ``scoring_matrix[a, b]`` that
            gives the score for aligning symbol ``a`` of `v` with symbol
            ``b`` of `w`.
        sigma: Penalty subtracted for every indel. Defaults to 5. Scores are
            kept in an integer matrix, so integer penalties and table entries
            are expected.

    Returns:
        A tuple ``(max_score, aligned_v, aligned_w)``: the best score as a
        string, followed by the two aligned substrings with '-' marking
        indels. When several alignments reach the best score, one is returned.
    """
    n = len(v)
    m = len(w)

    S = np.zeros([n + 1, m + 1], dtype=int)
    backtrack = np.zeros([n + 1, m + 1], dtype=int)

    # Fill the score matrix and remember which candidate produced each cell.
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            # Candidate order defines the backtrack codes:
            # 0 = from above, 1 = from the left, 2 = diagonal, 3 = restart at 0.
            scores = [
                S[i - 1, j] - sigma,
                S[i, j - 1] - sigma,
                S[i - 1, j - 1] + scoring_matrix[v[i - 1], w[j - 1]],
                0,
            ]
            best = max(scores)
            S[i, j] = best
            # index() picks the first candidate that reaches the maximum,
            # so ties resolve in the order above.
            backtrack[i, j] = scores.index(best)

    # Unlike global alignment, the best score can be anywhere in the matrix.
    i, j = (int(k) for k in np.unravel_index(np.argmax(S), S.shape))
    max_score = str(int(S[i, j]))

    # Walk back to the start of the local alignment, collecting the aligned
    # columns in reverse. Appending to lists keeps the traceback linear,
    # instead of rebuilding a whole string for every indel.
    reversed_v = []
    reversed_w = []
    while i > 0 and j > 0:
        move = backtrack[i, j]
        if move == 3:  # score restarted from 0: the alignment begins here
            break
        if move == 0:  # symbol of v aligned against a gap in w
            i -= 1
            reversed_v.append(v[i])
            reversed_w.append('-')
        elif move == 1:  # symbol of w aligned against a gap in v
            j -= 1
            reversed_v.append('-')
            reversed_w.append(w[j])
        else:  # diagonal: symbols of v and w aligned with each other
            i -= 1
            j -= 1
            reversed_v.append(v[i])
            reversed_w.append(w[j])

    aligned_v = ''.join(reversed(reversed_v))
    aligned_w = ''.join(reversed(reversed_w))
    return max_score, aligned_v, aligned_w

In [None]:
# Sample dataset: align the two example strings using the PAM250 table.
result = LocalAlignment(v='MEANLY', w='PENALTY', scoring_matrix=PAM250())

In [None]:
# Show the score and the two aligned strings, one per line.
print(*result, sep='\n')

15
EANL-Y
ENALTY


In [None]:
# dataset
result = LocalAlignment('RTDLVMGRFSWYEREDHMVYWRRKIMRISQTSNCLCFCVVLLFTVNRSCHQIKVEPDLSWSPSQEGRHQGDSAKGLWYCLMFYIWQWLATHYLEHDMLNAFCAAKFHKTPSEDRSRFHPVTGNMSHKQSDDGHCSYDGSPHQYWMHCMSQAAEQRRAQHNPWASHPLWERHFKHWYKLMYQKSYSADARHCGPYDANTAHSHHELTIHHVETQCQLRAPERKGAKNDSRHLGREWHDLTPWQHPRNRYMELCRANFCDVCHQQIMGLMLCVLFWTCFTKYHSRKIVNWGSRMKGNCDGSIEMPGVTDWVAAATCMQWHQFKHNDDNFAHWYIALWGFMLCEKPAVWGGLFKHECMYIVPYTFVMDPQQKDKCWCIKPIQHMQMLNCCYNHQEEMREGVPIFYWLTWGGAEFANSNGGAVMYIWQKEPASLFCLREEVEVWQFDDFLFQNWELQIFGGQVRKCNWLYTSAVPTMIHGMWYADVKEYRYVPFGETGYKPCHKQGNMNRRPPNRCCIKKGKCVGGKPTTEMFSGWEFRHIIFTQVGYPILNMSPFHSPVAWRSQAVCMSPKECWNGIQLSCVQNIEVAEFYEWQAKYDAFVMYPMGDWCFLIATWHHKPWTATTSIYRHPRWACEPTKDGFLAGWDINMYIQAHDYKKQRFDDRWAIERSVRTVFNCEPACFDLAFTLTFKCDLHFKHKCKNPQLIWQKNDAALIFWNYINHEWMEWRYYMTPLGYWWTRWYDNSPLIRAFRSIDPSTTSISLRCKFRHKSAWEVWCLKEHILWARYNMFGWRDCMGLGMGWACEESRPQTGWVMVQSLIRLTIKNLKEGEKVVEQFQVLQNLEFKDVFNDHMPHGQCDWLDADMMQWTAWLGFEENKPVKWWFPLQLWDLNLGPWDQERIHSHHYQIRNFLMVHLFTKPQWWEKRHRDYWVRMPEPKWLIHSNVQFVSLTRQAVRVHTVCQKEGRIMRKKINSKFLHALHSDDYFQIIKMWESMQSFYYGVMQYQYWFNPGIFENSMWIHIGPMPVDHKRMSHKIRGEYQCEIMQKHDPNAFMGLERQANDSVDIMHPMDCLKWKHATQNDCIHRSIATSTRYRYMGEFYRWFIIICGFWPYTINQVHHDNQEWHNHSRAKVRQLDAGGMNMGNCMSPDQWREAHSFTLKQALKTQWGMMPLISFVDMMKRAGYNVVVRSWKVCCESKLACGYWVPHCRSRMLWVKWIPKDFPSEERECCHNMEARFPSYFYMLLRPLPHYDEQSSSNIHEFFSHVSTCHEHRGQSWQNGHQKDQRFWCHVQELTQPFYDGFGQRMIQWIYRLQDDKKMSVWHLTKQTIMVAPHYLRPECTDRPRWSKFDFAEQTYGEPIMRAECHGEVEQRVTFIWYPMLHWRWPWQDMSTIPPMNRESLIYATCQVADARDPRGWGQMFQNSCTVPPQQTHTCKSWRQIDDMWCEPPMCKKQSMVWFNSCNVGENSYMRLIATGYQKLWVLSKHSGFVMPDTNPEQGLIAEHNEIYDAWGPKCSCDADTEAHWQYKGPDQNRQGLPPLHYEWILARHFTHCWTGTKWCIQISMGNYSNSDNQFNYETAGHPLEVDHYWNRYAWGYCCIAHREIWWIKRGVLQYGREWNQENKLNVQRMCMATCGEQPYWHNAVNEHPAHEVYSEEVWYQMHIASMYMTKTELWLPMFFPTLYSRMFNMWVLIDRAFVIHIMKSLDNDQLDDRIGIIEPMRLSVQREMSEFTYSKSFSEWPIAKCWKGYRVDMMIALYGIQCHVFDMAWWHPCYIGGVRIVSHTFVMDHYLIWISDRMPVGQIEREKAFQYRFFNYAWHNFQHSCNNFGGGRINNRSFYDGMWERLHDEFFEGQFEDEMAIFLQEMMAGGNSTVGMQITPGKLLSFYRTSSKPSFDLTSFYGWSHAFVITCFNHDFRDEKVAQSGIFTKCNFKLILAPRTQEADIEPMCFPSWFIDVAKWWEPQTWTNCEKQAMKF
GNYYMHPLCWPDRATNQPMVQSDQLAPQGHLLLNNSDTWFVNDLSSAAIVMAAFHCWQGTQIYKPWHIVHNDVCWHYCCEYKQSARSNAGSFFDISPDAQNYYKTWFIGPGARHKKRVGNGRLQLCGRYGPLIKACWLDKVVVEKGHDEFFIDGTQAMPDVEYIHAVEQVRKYSEFVGRGHHIEEYACMPTILPPVQFVKRVFCHPGTWGHEYMEIKVHKFINVYWIQKWEREPMPGNMSKAHDGRDEADEQQHSGASNKYTLIRIALAMHKRSTTMFSQAGKEPCMYKEFVWYQAIHCHNQWCMQNDVHKHWFSREEYCNTSTASDVYGGTLIYEFFPFFVFQVHRIATAMFGHDIIQRIRPWVEQNYESIAWKDGLLQSCREDWTDSSCVEHHVQFPYKILWTHPNKCTECVQESFVFLVVPMFMLTCPVDFHMWDKRDLYAFDYLGSCRHLYRDYSNIPDHVMYWIVFRSWSKSDQPNHTSIDFYQTKNCAYAMHACFFETGFATVSAHIYILKRCQYWFMKQYKWMCMAPFTFTRNIRPDHRCPAGHMHQEPSYVVHYPNKNEGFWYFEFGQYRSCGQISRWYMCISKLISECPISEWLWLCVFAYNVDMVYCMRLWYKGYMVVFLDADRIWVHDVYRLPHKFVLEKEAVPAHNMEWVYCAGEKLMTYHGQKQLMELLRDVQRWACTFWFLMLWVWWMHYRFSYILNRVVKVTTWDHDTSE','GCHHHSLHDWQCEHRHIIFAYWQSNFIDGFAVRIFPNWRNEMYEVMVDAPPSIYIRNIEGCGSKYMHWLTQGLKDFKWHSFYSHPGCKCTWENGSGQWTHNLCASCHKFCWEAEQLSEWPKRIEAHACVYFVADAVKPQTTQSFENHMIAYGLMAKQKMDTAFEMKFPPNLVATMATNMGFMLMAHYFKDMTCYYAALSRDKDGYGTMRLCPQPWKPIFHNMLKDLITPSNHMMPGLEMTCMMQHEDDWDCLDIWMVPMHNNHKQAPMNGTAYPGTADANEQHDSAMPLEAVIDFAKCWTGQVFMYDEDESEVFFTASFHVGSRPEATRHYVQFYPHIDAVTERLLIRVGMSWHRRCCEGASKRWCMLMMYPQSEMVSKECMHWQRTPLHYGNPLSWPCKRQWEDMFQDGGVIFIDLRDDAGAKKACCCSNCWKTRKVLEMSWQQVFDKTYQTDFYPETNFKTLVDRNLPWERCKECNDYTQAAYAGSMCPHQAIIAAFQWHEFKIKDPIIIRGSWGYYWAFGEQRQLRPKEQWYEYGTHAGCHFYDHGRMNYFKHLHVGHCHASYWLTSEHKKEEMHRVHILMYDRHHNNLSYQHRHWVYIAYNALIGWPDSALFGWWQDQHFKDTGGMNCRHSSVITYMLTPILAVYRKVLPHVWKSDAYDYDVEVGFTVIYHYAVSWSCGQTWKDTCDMAEVPIRQKHYLTNIAMTGMLTHVRSEENCPAGQYNFRRQCKKDINHLIHYEAYHMENRDGRYTYVTNPMTFHHFLYHWTRWYDNSPHIYAFRSTSISLRCKSEVWCLKEHILWMASGCVCMGLGMGWACHESRPQTGWQMVFSLIRLTIKNLKEKVVEQFQVFNDHRPHGQCDWLDADMWQWTIVWTVCHAAWMGFHERKPVKLWFPLQPFDQERIHSHHYQIRHLFTKPQWRYLDCQEKVYTHRHYWVHAPENKVSLTRKAVPVHTVCQKEGRDMRKKLAPNSLHDDYFQEWWFRRRWSMQAFLQYQGRFWFNPGIFWNSMWIHIGPMPVDHNWNSVRMSHKIRGEYQQWWICEFWWMHHYHMISFLRRVWCRSCFIHGYMGLERQANDSVDIMHPMDCLKWKHAWQNDCISRNMWFCWSSFIATSTRYGGVMFCQCGQREKYRWLWNFNTILWENCGFWDNQEMPDDGHNHSEAKVRQLDAGGMNMGNCMSPDQWRTSCSSSIAHRQTYKTQWGMMPLISFVAVMKRGYNVVGRSWKVCCEAKLACSYWVPHCRQMYQWCTKRMGWDKWIPKPPEICD
REECCHNMEARFPSYFYMNDLPIPHDWYKDEQSSSNIHEFYYEMWFMSHEHHQINSTCHEHRYQSWQCWCMVGFGPFDKRMIYWIYRLQADKELTKQTIMFYTAPHHPRWSMMPFDENVFWPYRAAEQTYGEPVMRAECHLSTNEVEQRVLNNLESQYLCDCWFPQGVYTRDPMLHDMSVIPPMNRESLIYMVSLQMCQVADAGDWIHHYVDENQPWWHKLRGWIQFSFQFQVWNFCTVPPQQTHTCWRQIDDMWKEPPICKKNVGENSYMRLIATGLQKLWVLQKHDCMHWAIMPDTNIMWGSPKTIEQGLLAEHNEIYDAWGYGEHWQWGDKIWLWKGPDQNRQGLPPRHFTHMWWYSNSDNQFNYETAGHFWYFNSMNVFEVDHKGAKMSYWGYMDCIAHREMWWPYKGVLQAKTTLRQYGREWNQENKLNVQRVWFPIKNNQCMATCGENEHPAHEVYSASMYMTKTELWLPNRMRHRKYFPTLYSRMFNMWVLIDRCEWWSYSHGFVIHVIVPQTQTQLDDLGGIIEPMRLKSFSNWPHAKCWRGYRWIALYGIQCHVFDMAWWHPCYIGGVRIVSHTFVWDHYLIWIDRMPVGQIEREKAFQYRFFNYAWHNFQHSCNNSTEHLSSLGGGFSGSSFQKDVIDGFTWLHVFLDRAAENVWCPDSLRGYKWRSKPIKDQNGTTWSKWNRDWQVMFKDVTNFTATLVEFMETHVTNVDGWPRHEEHWNNKGMTWEAHMKEQQCAPDNLKWTWSPDQKTGWSQLKMNCQDPMSFMSLFAMLKWWQFHLQTEFSDKDHDGLGFTYPLVIYCPTERKMKPGVIIQPKFPVPKQIHRNRDGSWIGGAQGAKVRPFPAPEQESHNVTSNCRQDRLPDRADEKEMHCSDHCCYDILGAHYQEPQQDRNYGARLEFDRPHVCLFYFEEDMSYTHHCGKDIKCGLGIQECEVIQSVEMINAWKSAMWIDITMNFFCCWDPLYGCEGSMGHEPIKLVPCADTMGGMCNKWSNAEQKGPNVHHKRCFWKYLHSWKCRAQQHWVWVFYMHIELAEWDLQCIRTIIVYTKRHFKRAPQPGSLKTKEEGKEENGMSEALQNEVLRHPFYPRCNCPWSDYLVTGCTDMIENQFAAYRHGKMCVHFFNKSLNLPGKFYKKFTVDSRPWEGLLTWKGPMSMLGTLKHGWGSMAQHWKDCLFLTGSKQMWTPDHKYPCRTPVMYEYWHSDIMKWYDTCPLEAENNVHCCWHATIEYPMIYWDTEGAGWEVHDKRIGPIWPNCETMVPMGDSYCSWHVRTRWPRDPEMFFVFMLQPDPPNMCMGQPNMIPCLQPGQDVMFADMGISTKGSEDRWKKTRMYWNMNPWDCYMTHFYMSRQPYWKNRPQDPWDQKWNLHCNKTTEGNFHPVARPDTSDKTLNAMQPHLYCFLKVPMHVVGEHPAGPMMITAMWFKVPTYHQHNDTLRSTSHQPEVFATISCTCIYDQ', PAM250())

In [None]:
# Show the score and the two aligned strings, one per line.
print(*result, sep='\n')

3577
GRHQGDSAKGLWYC----LMF-YIWQW-LATHYLEHDMLNAFCAAKFHKTPSEDRSRF-HPVTG-NMSHKQS-DDGHCSYDGSPHQYWMH--CM-S-QAAEQRRAQHN-PWASHPL-WE-RHFKHWYK-LMYQKS-Y-SADA-R-HCG-PYDANTAHSHHELTIHHVETQCQLR-APERKGAKNDSRH--L-GREWHDLTPW-QH-PRNR--Y--MELC-RA-N--FC----D-VCH-QQIM-GL-MLCVL-F---WTCFTKYH-SRKIVNWGSRMKGN-CDGSIEMPGVTDW-VAAATCMQ----WH-Q-FKHNDDNFAHWYIALWGFMLCEKP-AVW--GGLFKH-EC----MYI-VPYTFVMDP-QQKDKCWCIKPI-QHMQMLNC-CYN-HQEEMREGVPIFYWL-T--WGGAEFANSNGGAVMYI--WQKEPASLFCLREEVEVWQFDDFLFQNWELQIFGGQVRKCNWLYTSA-VPTMI-HGM-WYADVKEYRYVPFGETGYK-P-CHKQGNMN--RRPPNRCCIKKGKCVGGKPTTEMFSGWEF---RHIIFT-Q-VGYPILNMSPF--HSPVAW-RSQAVCMSPKECW-NGIQLSC-VQNIEVAEFYEWQ-AK--YD----AFVMY-PMGDW--CFLIATWHHKPWTATTSI-YRHPR-WAC--EPTKDGF---LA-GWDINMY-IQAH-DYKKQ-RFDDRWAIERSVRTVFNCE-PAC-F-DLAF-T-LTFKCDL-HFK-H-KCKNPQL-IW-Q-KNDAA-LI-FWNYINHEWMEWRY-YMT-PLGY--W---WTRWYDNSPLIRAFRSIDPSTTSISLRCKFRHKSAWEVWCLKEHILWARYNMFGWRDCMGLGMGWACEESRPQTGWVMVQSLIRLTIKNLKEGEKVVEQFQVLQNLEFKDVFNDHMPHGQCDWLDADMMQWT-------A-WLGFEENKPVKWWFPLQLWDLNLGPWDQERIHSHHYQIRNFLMVHLFTKPQW