## BA5E

**Global Alignment Problem**

Find the highest-scoring alignment between two strings using a scoring matrix.

Given: Two amino acid strings.

Return: The maximum alignment score of these strings followed by an alignment achieving this maximum score. Use the BLOSUM62 scoring matrix and indel penalty σ = 5. (If multiple alignments achieving the maximum score exist, you may return any one.)

Link: https://rosalind.info/problems/ba5e/

In [None]:
import numpy as np

In [None]:
class BLOSUM62(object):
    """Substitution scores loaded from a whitespace-separated text file.

    Each non-blank line of the file is read as ``<symbol> <symbol> <score>``.
    The scores are kept in a dictionary keyed by the symbol pair rather than
    in a 2-D matrix, so a lookup is ``matrix['C', 'T']``.
    """

    def __init__(self, path='/content/BLOSUM62.txt'):
        """Load the score table.

        Args:
            path: Location of the score file. Defaults to the path the
                notebook originally hard-coded.

        Raises:
            OSError: If the file cannot be opened.
            IndexError: If a non-blank line has fewer than three fields.
            ValueError: If the third field of a line is not an integer.
        """
        with open(path) as input_data:
            # Iterate the file lazily; split() already discards surrounding
            # whitespace, and blank lines give an empty list that is skipped
            # instead of crashing on row[0].
            rows = (line.split() for line in input_data)
            self.scoring_matrix = {
                (row[0], row[1]): int(row[2]) for row in rows if row
            }

    def __getitem__(self, pair):
        """Return the score stored for ``(pair[0], pair[1])``.

        Raises:
            KeyError: If the pair is not present in the loaded table.
        """
        # Given a pair such as ('C', 'T'), look its score up in the table.
        return self.scoring_matrix[pair[0], pair[1]]

In [None]:
def InsertIndel(word,i):
    """Return ``word`` with a gap symbol ``'-'`` inserted before index ``i``."""
    head, tail = word[:i], word[i:]
    return '-'.join((head, tail))

In [None]:
def GlobalAlignment(v, w, scoring_matrix, sigma=5):
    """Globally align two strings and return the score with one best alignment.

    Args:
        v: First string.
        w: Second string.
        scoring_matrix: Object indexable as ``scoring_matrix[a, b]`` that
            returns the integer score for aligning symbol ``a`` with ``b``.
        sigma: Integer penalty subtracted for every gap symbol (default 5).

    Returns:
        A tuple ``(max_score, aligned_v, aligned_w)``. ``max_score`` is the
        optimal score as a string; the two aligned strings have equal length
        and use ``'-'`` for gaps.
    """
    n = len(v)
    m = len(w)

    S = np.zeros([n + 1, m + 1], dtype=int)
    backtrack = np.zeros([n + 1, m + 1], dtype=int)

    # Fill the borders with penalties: a prefix aligned only against gaps.
    for i in range(1, n + 1):
        S[i][0] = -sigma * i
    for j in range(1, m + 1):
        S[0][j] = -sigma * j

    # Fill the score table and remember which predecessor gave the maximum:
    # 0 = from above (gap in w), 1 = from the left (gap in v), 2 = diagonal.
    # On ties the lowest code wins, because list.index returns the first hit.
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            scores = [
                S[i - 1][j] - sigma,
                S[i][j - 1] - sigma,
                S[i - 1][j - 1] + scoring_matrix[v[i - 1], w[j - 1]],
            ]
            best = max(scores)
            S[i][j] = best
            backtrack[i][j] = scores.index(best)

    # The optimal score sits in the last cell [n][m].
    max_score = str(S[n][m])

    # Walk back from [n][m], collecting alignment columns in reverse order.
    rev_v = []
    rev_w = []
    i, j = n, m
    while i > 0 and j > 0:
        move = backtrack[i][j]
        if move == 0:  # deletion: v[i-1] faces a gap
            i -= 1
            rev_v.append(v[i])
            rev_w.append('-')
        elif move == 1:  # insertion: w[j-1] faces a gap
            j -= 1
            rev_v.append('-')
            rev_w.append(w[j])
        else:  # match or mismatch
            i -= 1
            j -= 1
            rev_v.append(v[i])
            rev_w.append(w[j])

    # The walk ends on a border, so at most one of i, j is still positive.
    # That leftover prefix must face leading gaps in the other string;
    # without this padding the two aligned strings end up with different
    # lengths.
    aligned_v = '-' * j + v[:i] + ''.join(reversed(rev_v))
    aligned_w = '-' * i + w[:j] + ''.join(reversed(rev_w))

    return max_score, aligned_v, aligned_w

In [None]:
# Sample dataset: align 'PLEASANTLY' with 'MEANLY' using the scores loaded by BLOSUM62().
GlobalAlignment('PLEASANTLY','MEANLY',BLOSUM62())

('8', 'PLEASANTLY', 'MEA--N-LY')

In [None]:
# dataset
result= GlobalAlignment('NRLSFNREFKCSILNWMATQCAQPTHSIIDYMRVPYMQQGHHAWFTFEEGVDKEVGHRPTCQEREARRYSIDNNPKHMAITGPVMNNHSLWRDPFGRWKKAGCTYYTYCHPPMMHPKTHYSWYRQFWSQLILESEAVFGAAGKGPRSMNAPNGEKPHLNYGKKTRPNQCETVAHIFGPQRPLNATQWVKLRRYGWNPVVPAQTLVHVEDHFFVQWGVQMVNKHSEDPYGIGKIAFQYRMSHNRHRLAEQWIRISTIIIKAIEGKDLALAHNDCHQFWHAAEMILDAAQKNHWNIDPAQIKNCSHKMCQGYCRPCCLRADLKHCLHTKNGGIGCKRRFDIERQNRAGYEPMGHGWTRDLIWDKWPFKRQTANPMIYLSNQDKSNNDSSRVVFMYQMNTMPCQIAPHQDRRNCMITICVGPFSWWKYQQQWRIWSAWTDDHWDTCNMMPWPLCDMYHYLDEHHNSWAFRMYMMCDMMKPGECMDIVYVIQNVTMSSCAPWKDCTVFSRAFRSMLDMILIPKQITIILIWNMMNGCNCMIDWAFQQRYYWLSRKLHVHERFFWRMRRYNDCYTMGNCIWDMKCRKTWPYCHYLRRSFRSMTRQYNTITSTVHLESRHLDRQMIDHYYDYCEMFDTQRVFCSMCVNTSYNCAEWFPFEIHGNYQFQKQSIEHCCFNISPAHIEHNQKPLQNQNAPNSFTADDFSRECPKDFELDSLKDSWEAWLCCSTPHKPDCGFAIAQESMHFRGAINGEESRILAHGWWWDKMEAHLFWSECIMGWNLLITRRRRMNKAKVKECQTFLWGKNFSRRAMYANANYIVYTPVPCTWRARCWPFHYMAEALDEAQFLNRWNLCCCCGNRDQRTPCPGQIWQMLGNCIPYWWYRNHGQMSRLARRHHKLRSLNGQTQHTRCVASHTNCAELTSLDKAQVLSTLDMQVNWREEEPITLNAVQACDYMPPSAMWICYQIKWKQFCSWSSCKRAFTPNLMSIARRLGEAWASFDWSDDKCNDMFFIFPYIHAMASRTKWRHRKFLGHESRAFMYGDDMNLQKTIETSLLFTHDPNANTDGLQMDDTCARAQQAHCKSYKYDNHPIGSTRLTMKRETIQQTRWYNEDLMIMRQDDVAKAYKCTIQYLHGRSCCFNALRCTFVNVAGPQDQRSRHKPNAKIPDHHRKDCEWTEMGRQPHYSDHVYRQHYKTVSYWNGECTVNPWPFVTYYHYYNTQRSAWPWGPPVCIEMITVRQWPHGKLSKGDCLNAKKTEAEPKVYACVKGTADPCQMIYPFVILTGHRCSIWIMGVMVAMYQSLMRHWAIIVHDIRDHPVQKGDVPVKVRAHWHPPVVFGMYEVNMCDGHYKRQSPWSVNCGMYWDTRARCPPENIIAPMNTSNMTGDEKQMYHGYKTMEEQIMMEQDRRLAWQEKFQPGSPMEYFTDCFTCGKAHFVYWFMMRELMDQRSDHKKAGITFYICDCGARNSASRMLEHRARDDRHRTPVREMELLDIERQKLFMDDTTAFAWFGWIRKTCQMPMDNCQKYWRVAVDRPIGCVDDACIHVTATCNCGHALLFNKEAPVDADQNRYIFQHHKCCGMYLKNRQETLNRVFMQWKDCVDGVPKDIWSWDWEGWQTVRGPMQQCCMWSRAINAGCCEAKGAYEGGLCYAHGANCLGQDEREAEEYCLKWSTNMCGMWTWWKKEILKIDDMEETEQNWAANFLCGSTYSFEWSNETTYHSQKRMARLQYRLFFCCYCEIMFPICSVYLEANEDTPIARLYVKVPAPAHGNYKDDKASWGYCPTNKEYSEFCPDQMGMWTNQGNCCAQENDECSIFKCRIFPKWDSYEVYWQRGQAATWDVIAWTANVALHTCSASWWTYWQDMSMNCAHEKMDELGCAYAAHTRDKLFHWNMIRLFDKCLYTNCLHLALYDLCWDRWPWMRLMSIATKPSGTCLGMQLMCLWCMGEKRLQVNWFTMGRVKFWNLCMAMNGCYQPEFYM
FFVFMSFWNIKCQVLDKRRDTVLDEKHVSTHNPSKQVQDQAPMPWRQSNATMKKRWTNAIDCPDDPYDWFYHYYNTYHIWCLITQMDAQINTAERLIAVVAACAMCWTGGKNGHSEWFYGGHGGERTLCYMKYQCESGETRRTMKGLKKWREEGMWENGFCLVSTKTSWELVSVHVTWPKFHVSKHEMDIGEHARRMSKAIWIWKIRFNAFPPHIIMKACKAQHTEMGHWTKSSYVQKYWVAYIELHWDEAPNEVMIRCMMPSRAMSFCLFLVFQQTHEYFSGKTCKRNGEFEKFKNHMLEGTRSCHVFPKQMMPPQKMMMIPNIRWSERSNNAHFCKISYCGPGVPWDTQKKCMACKCLTGWCMEMANAECRRFWMNSMWWTPWVGPTNSGSIWLGHQQRFRRGEVHPGRLQDGTEAYQDICRDWENMCFQSEHFMETGEWCVTEGNTWNQFDYWTYPYCWGAGHKCWTHQPLSKYYMDDIWMNFCEEMDWDYQAQNQAYHWKPSYRRAETTAIDQHNVKNISPLFKCSLTRMWQPDSFWYWAWVDPTSLEGSQRECQVIWHIPTWNVQWHAWYHCHYGRKDFGAEGISAHCEIVVLNWYFVACIEHDETMELYVNHMFRIIPMNCNDHKWDWIWKYHSMFNQFDAMNWLLVTMQAATHWQWGIHDGMFRWKDKALDTTCIFGQKVIWQEPHWWTEPNMMIPPGWIHNLNKLMNDEAWPIGYDMNVLIMAWWKMQRGFLNGCWGHLLTDNVCIWYRAWPCYHKCQIADTMAWSKFDKANQTIVGTICVGDLHCESFYCQLQSLYNCHPHWCMRKIPSNPDIHRFSVSSFTECEFGLYTDPSDVWRGRCEPVWNVKMYTVTCDKTCDPQYQMLPQARGKSSLVYTYIPTPTHKFYKHFLIEETAIFKDLRDAPKNVPYWNKEVQRPYPCNAWKTCRIRTQFSMNCDIIRTQTENATHNYMFIGLMIQFDIAKPTHEQPSVSLWARQTLWVWACKDMYITPCYCKDWGYPLILQFKIPNSRGICSYTNFDYDKEKPSYWEPHSADTSYQKVQASWWQKEYCCIYRDWEWRLMNYNWSWPLQNQEGGQIYRRGAHQMRHLQQWNYYSLMGIQYEWNMRPMNDWSMIQAEIVMKPLFFHDKCCTRERSWQISIDPMDMLEIEKLLAFKRQIIFKNWDMHCGVNQNVQWYYFFYYALEMALNFYSIWCFSFLEQDCMKDLSKHLKCNSVHRGNKCIYFMDIVNEANTQKTYEPQVVEIGNSNCGVAKPPVDPQQNQWCNQCWRCQHVMVGWMCRGVVGEPQSRICYYQAVQCKMFRIPDTMNMNKAMFKQGAKDHEGCQKGCFMDSPWFYKNGHLSANNHDIRKKTDYVVHEQDQCIRYWLRVLVYCICGDPLARKCCQYNENWNKLWQGHASGNKKNYWNDWQCMNQDFSNTVRVNINPTMRSAKDLTSRLKPYQDMLLLPMESEMEFIRADKDCSPMSGLWMAVQMECVWKTSLRATATGGLAMHCFLVKFYCDNWRHYQDKHCLIAMCQVNFIHDDNMASPNVAKEKGRLILLHEVTYRIALGRGGFGNWQCICLYSHRIMEDLPIFCLSPIGYSTASESKIQCMFVFLWCFCWGPFSAYKLFYCSMCVVKAGYHLIWKKTSKNVWLLEVKRKTNCAKTKGHGSQKAFAEWNGIPNCSPPVGHQVHADPYTDQQTCYIEYQGQWMVDIDFVSDFGAVEKFGHQQLEDRLPSCSRQPLWPENYEDRVQLDCMIMWYGAAGKERHEQEEFQKCQKCVRQCSTRTADMFAYHFIIRTFANSEMWNLNCHAKFVRKDRCRVRFLTGMLRGEETMQNGQMVTPNRAFILTNNKCFMMNSCQCYQFNRHATVEDTVGSCELVQADRYPMHWLTCKTQGNMYPIWLMVVGSKGKTCEYTEPHVYDRFKEDFSLLFEVIIWDYHKIWFPKVASCTQSCSTIDNGADVCPSEFEIRPRACDHMAHNVAPSHHEAMQNWMTQTNQFCLNKQ
DTRMNWPVWKWPYKSRMYRVMGWDSECPTWVCNRRSRALYFHGHPAMSSTLCLSNLLEVYAKERWGIRGRWYANHSMGDFTFWMEQHNPSYYWRLPTDHPKKMCVITWCQWFDLRQGKQPDGRPKHLMGYVEFVPYYLYAGIGLSLKVGVIRD','NRNSSMVIDVHSFNREFKNWMATQCAQPTMSINYYMRGFNLPMFPNWAKERQVEWIPQMGYHAWFTFEVGHRPTCQEREARRYSIDHMANSVKTGPVMNLWRDPFQFWKKAGCTIPAQYTYEHPPMPYRQFWSQLILERMWHSRQREMRHDTPEFLQFGAPRSHWNAPNGEKKHLKSYFPAAWTFQCETVAHIFGPRDRILWVKLRRYGWNPVVPAQTLVHVEDHAFVQWGVQMVNKHSEDEYGIGKIAFLRYCTCCAEQECGTRWECREWPGKGNNGKTLHPPKHMCLAAEMIPIDPAQCSHKMCQGYLRYGKQNDMFRKNGGIGCKIACMRGYGMMEQIERQNRAGYEPKGGWTRDLIWDKWKTSLCPFKRQTANPMIYLSNQTYKKSNNDSSRYVDQLQANTMPCQIAPHQDRRNCMITQCVGPFSWSAWTDDHWDTCNMMPDMYHYLDEHHNSWFCFSGNLFRMYMMGDNMHPGECMDNDNMNVYSKIQNVTMSCCLRGLSTPWKDCTVFKDRPRAFRSMLNWLNGCNCMIDNAQHQYYWLPRKLHVHERFFMWWRMRRYLDCYTMGNCIWDMRCRKTWPYCHYLIRSFRSMTRQYNYNAESYIDIDHYYDHCEYFDTQRVFKCAEWFIHGCGNHINMHCGSPAHCEHNIKPLQNQTAPNHFTADGTRFREKYKQFEWEANSPGGVKLCCSTPHKPYCDFHFRNTRWFAINGEESRILAHGWWWDKMEAHLFWSNCRRRPMKQEKQHNFEKDMNANYITEHKVTWRARCWPGGMPHYMAEALFANRWNLPLNKLEWVEDQRTPCPGNNTEKEMRCSNTGAEQMLGNCIPCWWYRNHGQMSRLARRMGNFEIMTHKLRSLNGQTQHTRCVHSNCAYLTSLDKAQVLSTLDMSVNWWEEEPCYITYTTTLNAVQRMWFFKNCYQIKQKEFKSWSSCTASAFTPNRTWEKMSIARWSISQCADDKCNDMFFIFAYIHAMAIQFQKWAEWQWRHVKLLGHESRAVHLAVHFQMYGDDMNLAKTIEGTCEKQESKSLLFTHDPKGLDTCQMSYEYDNKPNEQLTPIGVVIDHTMWTMQQTLKGWRCHHRSSKDDVAKAHGRSCCGNALRCTFPQDQIDISVEGSRHKPNALRENIPDHHQCQEQTKDCEWTEGMTLGSWHHSHNHSHVYRQHYKTVSYWNRECTVNPWYYNTQRSAWPWGPPVCVTQWPHGKLSDGDCLNAKKTFMNWAGLQTPKVYWITDYDDGRPCQMILPFVILTGHRCSCWIMGVMVAMYVGHWSLWACESDHRDHPVQKGDVPVKVRAHWHPPVVFGMYEVNMLNTFKKNFDDGHYKRQSMWSVNHGMYWDTRRKLADCAKRCPPEIAPMQMYHGKTMEEQIMMEQDRRLAWQEKFPHWMTGPGSGSSMREGKAHFVYWFNMKTVQAEELMDQRSDYKKTGSFFLTGCDCFASRMLERARHRTPVREMELYDKDDIESQKLFMNDTTAFAWFWHWIRSELLEVVTCQPMANYLREEPHHQSKYMRVAVDRPIGCVDDACIRSCPLNPFGHALLMLVEAPVSADFQHHKCCVMNNHNRSRPTLKNRQEALNRVFMQQVWHFIHGIDGHCWEGWQTVRGLMQQCCMCSRAINWGCAEEGGLCYAHGGQDEREAEEYCLKLSTNMCGMWEETEQNWAARFLCYYSYMICEKFVPTNGVRTREIVYGENEYPIYDNLVAKYHSQYRLFFCCYYEIGCDWFCLKAFPICSKYLEANEDTPIARLYVKHHLRWNYKADKASWGYFFTNKEYMGMWTNQVNCCAQENDHCSIFKCRIFPVQKYPYDSYVYWQRGQAATWDVIAWTANAAGKHSSCSA
SRAYYCTTYWHPWLAVDMSMNHEKMDELGCCIFDEYVHIPAAHTRDKLFGWDKILISLPARAYTNCLHLALYDLCWDRWPWRRGYRSIATVPRAGTCLGMQLMCLWCMGEKRRQVNWFTMKRPNKVKFWNVCEQHTPMIEKHLEEFANDDANQNRDFFVFMSFWNIKCHWRACHTDLVLKKEHEKHPSTHNPSKQVDLNADQAPMPWRQEPYNTNDNATMKRWMNAIKLRNCCCFRNRFYPDDQYDAEIMKCPKYIWPMERLIFVDRWCCEVVAQHRRDRCAMCWGHSCCQDQSKRALGGCLPNYIRRKDCLSSKYQCESGETRRTMKGLKKWRCEGMWENGNYLVEHKTSWELVSTWPKFHVMCHEMALKPHVDSIIGERGRRCSKAIWIWKIRFNAFPPHNHHFRFGQNMFACKAQHHKSSYVQQLFYAAYWIYQPRPYMGSMNAYIELHWDEAPNEVMIRRALSFCNFLVFQQTHEYFSGKICKRNGEFEKFKNHMPKNMMMMILGLAMWIRWTERSNNAHFCKIVKCGVYHICGVPWDTQKKCLWCIGLTGWCMEMAQAECRRFWMNSMDRCSPSYWTPWVGPTNRWRRGSIWHGHQQGWRRGEVHPILHNVELAQDICRDWENMCFQSTTELVHMNIWNQFSPPYCWGAGHKPWTHQPLSTGYEYVMDWMNFCEEMDWDYRFDYLQCQNQHSKSYHWKPDYRRAETTAIDLFFWWTPHSFWYWAWVDPCRFVANFIVSLTWGFVSGKWVIWHIPTWNVQWHAWYHCALNWYFVACIEHVENHPMNCNDHKWDHIFPMTWKYHSAFNQVDAMLCGFRFCTWMFANSHIDACHNGPTPIGMFRWKDKALDGSYFYMFWAHTEPNMMIPLGWIHNLNKDYAGMNDEGYDMNCLAWWKMQRGFLGNPCEHKCQIADYMAWSKEHYETHVGTICVGDLHCESLYNCHPDWCMRKIPSNPDGERAHRFSHMEEANGCHYSFTVKCEFKPAVCHYKCLYTDVSDEPVTCDKTCARGKSSLVTHGFYKHFLIEETFYERKLFKDSNNDGWRCRDAMPEDWKECNPIAPIKTLEVQRPYPSNAWKTCRIRDEQFSMNCDQITENAYHNYMFIGLMIYFVIAKPGYLRQRVYMASWLYARQTLWVMACGSRQDMYITPCYILQRGINIHEPYMWLPYDYDKEKYSYYIQQVLAEPHSADSSYASWMQKEKRDWENRLMNYCWSWWLQNQEGGFIYRRGAHQMRHQQQCCMFFFCLNYYSLMGIQYEWNMRPMNDWCIRLSYIDKCCTRERSWQISIDPMDMLEIEKLLASPLKRQIIFKDWDMHCQNHCNLYNQENQNVQWYYHEIRKIKWHFYYALEMALNFYSHQDCMKDLSISNKHLKCTSVHRGNKCIYFMDWVNEANTQGIDDHEIGNSNCGVAQGNPQQNQWCDQCWRCQHNMIGWMTRGVFGEQQSICRMSTMNMNHAMLKQGAKVCFMDPWFCHVCFKNGHLSAGNHDHRKPTEHYVVHEQDQCIRYWLRVLVYCICGDPLARKCCQYNENWNKLWQGHASGNKKNYWNMFWMMQDFSNTVRVNINPTMRSAKDLTSRLKPYQDMLLFPMESEMEFIRADKDASPMKGLWMAVQCECVTSLRAWLVGGSNNYINFAMHMMFAFHWWEGAFYCDNWMHYQDKCLMNDFYSAMCQVNFIHCDNMAPPNVMCEKGMLDPKCTVMEVRIALGQSVNSQCICLYSHRIMEDLPIFYLSPIGYTTAYDESCIQCKTPNFSATYIWYEVMCVVKAGYHLIWKKTSANVWLLEVKRKDDRDFWIMMNCAATKGHGPQKAFAEWNGIPNCSRAQNIMKYVGSQVHADPYTAFLFQEQGSTLHWMVIIIFVEFGAVEKFGHQQLEDRRQPLWQENCCYEDRVLDCMIMWERHEQEEFQDCQKCSTRTAAYHFIIRTQEHGNVRKDRCRVRFLTGMLRGEITMQNGQMVTPFRAFILNNKCFMMNDHDHLEANVHATVGRPAEPVDTV
GSRELVQRYSMHWYPKPSQFGYNQVFSKGKTCEYTDRFKHSIVDFSPDLLFEVIIWDYHKMQFSYSLKVASCSTIDQGADYCPSGNREASIWCEIIPRACDHMATNVAPSHHTNQFCLNKWDTSMNWPVWKWSYKSRMYRVMGWDGECGTWVCNRRSHPAMSLPWTHVWFNLLEVYAGDRWQPWDKTNIRGRWYATFWMQHNPEHPKKMSWFDRHKCVHHHCLTGYVITPKTYFDPYKNYCGIGLSCKVGVIIHMRVDRED', BLOSUM62())

In [None]:
# GlobalAlignment returns three strings (score, aligned v, aligned w); print one per line.
print('\n'.join(result))

10040
NR-LSF--N-REFKCSILNWMATQCAQPTHSIIDYMR---VP-YMQ--QGHHA-WFT---FEEGVDKEVGHRPTCQEREARRYSIDNNPKHMAITGPVMNNHSLWRDPFGRWKKAGCTY---YTYCHPPMMHPKTHYSWY--RQFW-S-QLILESEAV-FGAAGKGPRS-MNAPNGEKPHL-NYGKKTRPNQCETVAHIFGPQRPLNATQWVKLRRYGWNPVVPAQTLVHVEDHFFVQWGVQMVNKHSEDPYGIGKIAF-QYRMSHNRHRLAEQWIRISTIIIKAIEGKDLALAHNDCHQFWHAAEMILDAAQKNHWNIDPAQIKNCSHKMCQGYCRPCCLRADLKHCLHTKNGGIGCK-R--R-F---D-IERQNRAGYEPMGHGWTRDLIWDKW-----PFKRQTANPMIYLSNQD-K-SNNDSSRVVFMYQMNTMPCQIAPHQDRRNCMITICVGPFSWWKYQQQWRIWSAWTDDHWDTCNMMPWPLCDMYHYLDEHHNSW---A---FRMYMMCDMMKPGECMD---I-VYV-IQNVTMSSC----A-PWKDCTVFSRAFRSMLDMILIPKQITIILIWNMMNGCNCMIDWAFQQRYYWLSRKLHVHERFF-W-RMRRYNDCYTMGNCIWDMKCRKTWPYCHYLRRSFRSMTRQYNTITSTVHLESRHLDRQMIDHYYDYCEMFDTQRVFCSMCVNTSYNCAEWFPFEIHGNYQFQKQSIEHCCFNISPAHIEHNQKPLQNQNAPNSFTADDFS-RECPKDFELDSLKDSWEAWLCCSTPHKPDCGFAIAQESMHFRGAINGEESRILAHGWWWDKMEAHLFWSECIMGWNLLITRRRRMNKAKVKECQTFLWGKNFSRRAMYANANYIVYTPVPCTWRARCWP--F-HYMAEAL--DEAQF-LNR--W--NLCCCC-GNRDQRTP-CPGQ-IWQMLGNCIPYWWYRNHGQMSRLARR--H-----HKLRSLNGQTQHTRCVASHTNCAE