<a href="https://colab.research.google.com/github/molsonkiko/align_isoforms/blob/main/align_isoforms.ipynb" target="_parent">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
!pip install biopython
# standard library
import time
# 3rd-party libraries
import Bio
from Bio import Align
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import requests



For a discussion of protein isoforms, see https://en.wikipedia.org/wiki/Protein_isoform

In [2]:
# see https://rest.uniprot.org/docs/#/
BASE_QUERY = "https://rest.uniprot.org/uniprotkb/search?query=accession%3D"
def get_protein(acc_num: str, timeout: float = 30.0) -> list:
    '''Get the information in UniProt associated with accession number acc_num.
    API documentation: https://rest.uniprot.org/docs/#/uniprotkb/searchCursor
    acc_num: a UniProt accession number; it is appended verbatim to BASE_QUERY
    timeout: seconds to wait for the server before giving up, so that an
        unresponsive server cannot hang the notebook indefinitely
    Returns: the value stored under 'results' in the JSON response (the rest of
    this notebook indexes it with [0], i.e. treats it as a list of entries)
    Raises: requests.HTTPError for an unsuccessful HTTP status,
    requests.Timeout if the server does not answer within `timeout` seconds,
    KeyError if the response JSON has no 'results' key.
    '''
    resp = requests.get(BASE_QUERY + acc_num, timeout=timeout)
    resp.raise_for_status()
    return resp.json()['results']

In [3]:
def get_sequence(prot: dict) -> str:
    '''Pull the sequence string out of a UniProt API result.
    prot: JSON from the UniProt API for a protein; only its first element
        (prot[0]) is consulted
    Returns: the string stored at prot[0]['sequence']['value']
    '''
    first_entry = prot[0]
    sequence_info = first_entry['sequence']
    return sequence_info['value']

In [4]:
def get_isoform_ids(prot: dict) -> list:
    '''prot: JSON from the UniProt API for a protein
    Returns: a sorted list (without duplicates) of the UniProt accession nums
    for all isoforms listed in the comments of the first entry, prot[0].
    An entry that has no 'comments' key at all yields an empty list instead of
    raising KeyError.
    '''
    entry = prot[0]
    # Every level is optional: a comment may lack 'isoforms', and an isoform
    # may lack 'isoformIds'. `or ()` turns a missing/empty value into
    # "nothing to iterate over".
    found = {
        iso_id
        for comment in (entry.get('comments') or ())
        for isoform in (comment.get('isoforms') or ())
        for iso_id in (isoform.get('isoformIds') or ())
    }
    return sorted(found)

In [5]:
def get_isoforms(prot: dict) -> dict:
    '''prot: JSON from the UniProt API for a protein
    Returns: A mapping of UniProt accession nums to the UniProt API JSON
    for all isoforms of the protein.
    Fetching is best-effort: an isoform whose lookup raises an ordinary
    exception is left out of the result and a warning names it.
    KeyboardInterrupt and SystemExit are NOT swallowed, so the loop can still
    be interrupted.
    '''
    import warnings

    isoforms = {}
    for iso_id in get_isoform_ids(prot):
        try:
            isoforms[iso_id] = get_protein(iso_id)
        except Exception as err:
            # Keep going, but say which isoform is missing and why.
            warnings.warn(f"Skipping isoform {iso_id}: could not fetch it ({err!r})")
    return isoforms

In [6]:
def get_all_prots(acc_num: str) -> dict:
    '''Collect a protein and those of its isoforms that differ from it.
    acc_num: The UniProt accession number of a protein
    Returns: a dict whose first key is acc_num, followed by the accession
    numbers of the isoforms whose sequence differs from that of acc_num; each
    key maps to the UniProt API JSON for that accession number.
    '''
    base = get_protein(acc_num)
    base_seq = get_sequence(base)
    result = {acc_num: base}
    for iso_id, iso_json in get_isoforms(base).items():
        same_as_base = get_sequence(iso_json) == base_seq
        # An isoform is kept if it is listed under acc_num itself or if its
        # sequence is not a repeat of the base sequence.
        if iso_id == acc_num or not same_as_base:
            result[iso_id] = iso_json
    return result

In [7]:
# Accession number used as the running example in the cells below.
claudin = 'P56856'
# Fetches this entry and its isoforms from UniProt (performs network requests).
all_prots = get_all_prots(claudin)

In [8]:
def get_all_seqs(prots: dict) -> dict:
    '''Reduce each UniProt API result to just its sequence.
    prots: a dict mapping accession numbers to UniProt API JSON
    (returned by get_all_prots).
    Returns: a dict with the same keys, in the same order, whose values are
    the sequences given by get_sequence
    '''
    seqs_by_acc = {}
    for acc_num, prot_json in prots.items():
        seqs_by_acc[acc_num] = get_sequence(prot_json)
    return seqs_by_acc

In [9]:
# Map each accession number in all_prots to its sequence string.
seqs = get_all_seqs(all_prots)
# Tuple unpacking: this line only works when seqs holds exactly two sequences
# (any other count raises ValueError).
seq1, seq2 = seqs.values()

In [10]:
def to_fasta(seqs: dict) -> str:
    '''seqs: an {accession number -> sequence string} dict, such as the one
    returned by get_all_seqs (each value is wrapped in a Bio.Seq.Seq).
    Returns: The accession numbers and sequences of those proteins in FASTA
    format, one record per dict item in the dict's order; an empty dict gives
    an empty string.
    '''
    # description='' keeps SeqRecord's default "<unknown description>"
    # placeholder out of the FASTA header line, leaving just the accession num.
    return ''.join(
        SeqRecord(Seq(seq), id=acc_num, description='').format('fasta')
        for acc_num, seq in seqs.items()
    )

In [11]:
# https://rest.uniprot.org/beta/docs/
WEBSITE_API = "https://rest.uniprot.org/beta"  # not referenced by the function below
def request_multi_alignment(seqs: dict,
                            email: str = "lestimpe@gmail.com",
                            poll_interval: float = 4,
                            timeout: float = 30) -> str:
    '''Send a request to the European Bioinformatics Institute
    for multiple alignment of several sequences
    seqs: an {accession number -> sequence} dict; converted with to_fasta
    email: contact address submitted with the job. The default is kept for
        backward compatibility; callers should pass their own address.
    poll_interval: seconds to sleep between job-status checks
    timeout: seconds to wait for each individual HTTP request
    Returns: the text of the alignment in the "clustal_num" output format
    Raises: requests.HTTPError / requests.Timeout for a failed HTTP request,
    RuntimeError if the job stops in any state other than FINISHED.
    '''
    clustalo = "https://www.ebi.ac.uk/Tools/services/rest/clustalo"
    submit = requests.post(
        f"{clustalo}/run",
        data={
            "email": email,
            "iterations": 1,
            "outfmt": "clustal_num",
            "order": "aligned",
            "sequence": to_fasta(seqs)
        },
        timeout=timeout,
    )
    submit.raise_for_status()
    job_id = submit.text.strip()
    # Status names as listed in the EBI job-dispatcher REST docs: a job can be
    # waiting in the queue before it runs, so "not RUNNING" does not mean
    # "done". Keep polling through every unfinished state.
    unfinished = {'QUEUED', 'PENDING', 'RUNNING'}
    job_status = 'RUNNING'
    # ping the server every few seconds to see if the job is done
    while job_status in unfinished:
        time.sleep(poll_interval)
        status_resp = requests.get(f"{clustalo}/status/{job_id}", timeout=timeout)
        status_resp.raise_for_status()
        job_status = status_resp.text.strip()
    if job_status != 'FINISHED':
        # e.g. ERROR, FAILURE, NOT_FOUND: there is no alignment to download
        raise RuntimeError(
            f"Clustal Omega job {job_id} ended with status {job_status!r}")
    # now that the job is done, get the alignment
    resp = requests.get(
        f"{clustalo}/result/{job_id}/aln-clustal_num", timeout=timeout)
    resp.raise_for_status()
    return resp.text

In [12]:
def align_isoforms(acc_num: str, **kwargs) -> str:
    '''Fetch every isoform of a protein and align their sequences.
    acc_num: the UniProt accession number of the protein
    **kwargs: accepted, but not used anywhere in this function's body
    Returns: the multiple sequence alignment from request_multi_alignment when
    there are at least two sequences; otherwise a notice is printed and the
    single sequence itself is returned.
    '''
    seqs_by_acc = get_all_seqs(get_all_prots(acc_num))
    if len(seqs_by_acc) >= 2:
        return request_multi_alignment(seqs_by_acc)
    # Nothing to align against: report it and hand back the lone sequence.
    print(f"The protein with UniProt accession number {acc_num} has only one isoform")
    only_seq = list(seqs_by_acc.values())[0]
    return only_seq

In [13]:
# test aligning something with two isoforms
# (align_isoforms fetches from UniProt and, when there is more than one
# sequence, submits an alignment job to the EBI web service, so this cell
# needs network access and blocks until the job has been polled to completion)
alignment = align_isoforms(claudin)
print(alignment)

CLUSTAL O(1.2.4) multiple sequence alignment


P56856        MSTTTCQVVAFLLSILGLAGCIAATGMDMWSTQDLYDNPVTSVFQYEGLWRSCVRQSSGF	60
P56856-2      MAVTACQGLGFVVSLIGIAGIIAATCMDQWSTQDLYNNPVTAVFNYQGLWRSCVRESSGF	60
              *:.*:** :.*::*::*:** **** ** *******:****:**:*:********:****

P56856        TECRPYFTILGLPAMLQAVRALMIVGIVLGAIGLLVSIFALKCIRIGSMEDSAKANMTLT	120
P56856-2      TECRGYFTLLGLPAMLQAVRALMIVGIVLGAIGLLVSIFALKCIRIGSMEDSAKANMTLT	120
              **** ***:***************************************************

P56856        SGIMFIVSGLCAIAGVSVFANMLVTNFWMSTANMYTGMGGMVQTVQTRYTFGAALFVGWV	180
P56856-2      SGIMFIVSGLCAIAGVSVFANMLVTNFWMSTANMYTGMGGMVQTVQTRYTFGAALFVGWV	180
              ************************************************************

P56856        AGGLTLIGGVMMCIACRGLAPEETNYKAVSYHASGHSVAYKPGGFKASTGFGSNTKNKKI	240
P56856-2      AGGLTLIGGVMMCIACRGLAPEETNYKAVSYHASGHSVAYKPGGFKASTGFGSNTKNKKI	240
              ************************************************************

P56856        YDGGA

In [14]:
# test aligning things with >= 3 isoforms
# (the accession number queried here carries a '-2' suffix, i.e. the lookup
# starts from one particular isoform's accession number)
ampk_gamma = 'P54619-2' # https://en.wikipedia.org/wiki/PRKAG1
align_ampk = align_isoforms(ampk_gamma)
print(align_ampk)

CLUSTAL O(1.2.4) multiple sequence alignment


P54619-2      --------------------------------MKSHRCYDLIPTSSKLVVFDTSLQVKKA	28
P54619-1      METVISSDSSPAVENEHPQETPESNNSVYTSFMKSHRCYDLIPTSSKLVVFDTSLQVKKA	60
P54619-3      METVISSDSSPAVENEHPQETPESNNSVYTSFMKSHRCYDLIPTSSKLVVFDTSLQVKKA	60
                                              ****************************

P54619-2      FFALVTNGVRAAPLWDSKKQSFV---------GMLTITDFINILHRYYKSALVQIYELEE	79
P54619-1      FFALVTNGVRAAPLWDSKKQSFV---------GMLTITDFINILHRYYKSALVQIYELEE	111
P54619-3      FFALVTNGVRAAPLWDSKKQSFVVLRALSCPLGMLTITDFINILHRYYKSALVQIYELEE	120
              ***********************         ****************************

P54619-2      HKIETWREVYLQDSFKPLVCISPNASLFDAVSSLIRNKIHRLPVIDPESGNTLYILTHKR	139
P54619-1      HKIETWREVYLQDSFKPLVCISPNASLFDAVSSLIRNKIHRLPVIDPESGNTLYILTHKR	171
P54619-3      HKIETWREVYLQDSFKPLVCISPNASLFDAVSSLIRNKIHRLPVIDPESGNTLYILTHKR	180
              ************************************************************

P54619-2      ILKF

In [15]:
# test aligning things with only one isoform
# (in that case align_isoforms prints a notice and returns the bare sequence
# instead of an alignment)
ckb = 'P12277' # https://en.wikipedia.org/wiki/CKB_(gene)
align_ckb = align_isoforms(ckb)
print(align_ckb)
has1 = 'Q92839' # https://en.wikipedia.org/wiki/HAS1
# NOTE: align_has1 is not printed in this cell; only the notice emitted by
# align_isoforms itself appears in the output.
align_has1 = align_isoforms(has1)

The protein with UniProt accession number P12277 has only one isoform
MPFSNSHNALKLRFPAEDEFPDLSAHNNHMAKVLTPELYAELRAKSTPSGFTLDDVIQTGVDNPGHPYIMTVGCVAGDEESYEVFKDLFDPIIEDRHGGYKPSDEHKTDLNPDNLQGGDDLDPNYVLSSRVRTGRSIRGFCLPPHCSRGERRAIEKLAVEALSSLDGDLAGRYYALKSMTEAEQQQLIDDHFLFDKPVSPLLLASGMARDWPDARGIWHNDNKTFLVWVNEEDHLRVISMQKGGNMKEVFTRFCTGLTQIETLFKSKDYEFMWNPHLGYILTCPSNLGTGLRAGVHIKLPNLGKHEKFSEVLKRLRLQKRGTGGVDTAAVGGVFDVSNADRLGFSEVELVQMVVDGVKLLIEMEQRLEQGQAIDDLMPAQK
The protein with UniProt accession number Q92839 has only one isoform


In [16]:
# Align a protein with more isoforms than the earlier examples and print the
# full alignment text.
frmd3 = 'A2A2Y4-4' # example with 5 isoforms
align_frmd3 = align_isoforms(frmd3)
print(align_frmd3)

CLUSTAL O(1.2.4) multiple sequence alignment


A2A2Y4-4      ------------------------------------------------------------	0
A2A2Y4-3      ------------------------------------------------------------	0
A2A2Y4-5      --------------------------------------------MQLSKRETKGQFLIDH	16
A2A2Y4-1      MFASCHCVPRGRRTMKMIHFRSSSVKSLSQEMRCTIRLLDDSEISCHIQRETKGQFLIDH	60
A2A2Y4-2      MFASCHCVPRGRRTMKMIHFRSSSVKSLSQEMRCTIRLLDDSEISCHIQRETKGQFLIDH	60
                                                                          

A2A2Y4-4      ------------------------------------------------------------	0
A2A2Y4-3      ------------------------------------------------------------	0
A2A2Y4-5      ICNYYSLLEKDYFGIRYVDPEKQRHWLEPNKSIFKQMKTHPPYTMCFRVKFYPHEPLKIK	76
A2A2Y4-1      ICNYYSLLEKDYFGIRYVDPEKQRHWLEPNKSIFKQMKTHPPYTMCFRVKFYPHEPLKIK	120
A2A2Y4-2      ICNYYSLLEKDYFGIRYVDPEKQRHWLEPNKSIFKQMKTHPPYTMCFRVKFYPHEPLKIK	120
                                                                          

A2A2Y4-4      ---------