### RegEx for Protein lookup

In [12]:
# Sample amino-acid chain used by the cells below. The patterns in this
# notebook treat 'M' as the start marker and '_' as the terminator.
aminoacid_chain = "MAYIKDALARHELEIAK_FYAKRKMAWVAVANRVVGMLKQYPDTKATYEGLFLMQEAY_EKMGLTALANDTQKIIDANKDKTFAPIEKPNEPDLKVPA_VK"

In [13]:
from re import search, findall

In [3]:
def regex_longest_protein(seq, regex="M.*?_"):
    """Return the longest substring of ``seq`` matched by ``regex``.

    Args:
        seq: Sequence string to scan.
        regex: Pattern passed to ``re.findall``. The default, ``"M.*?_"``,
            matches lazily from an ``M`` up to the next ``_``.

    Returns:
        The longest match, or ``None`` when nothing matches. When several
        matches share the maximum length, the leftmost one is returned.

    Note:
        ``findall`` reports non-overlapping matches only, and returns the
        group contents instead of the whole match when ``regex`` contains
        capture groups.
    """
    matches = findall(regex, seq)
    # key=len ranks candidates by length; a bare max() would compare the
    # strings alphabetically and could pick a shorter match.
    return max(matches, key=len) if matches else None

In [4]:
def regex_longest_protein_range(seq, size):
    """Return the longest ``M..._`` stretch whose inner length is within ``size``.

    Args:
        seq: Sequence string to scan.
        size: Two-element sequence ``(min size, max size)`` giving the
            allowed number of non-``_`` characters between the leading
            ``M`` and the closing ``_``.

    Returns:
        The longest match, or ``None`` when nothing matches. When several
        matches share the maximum length, the leftmost one is returned.
    """
    # tuple() lets callers pass a list as well as a tuple to the % operator.
    pattern = "M[^_]{%d,%d}_" % tuple(size)
    matches = findall(pattern, seq)
    # key=len ranks candidates by length; a bare max() would compare the
    # strings alphabetically and could pick a shorter match.
    return max(matches, key=len) if matches else None

In [5]:
# Run on the sample chain with the default pattern "M.*?_".
regex_longest_protein(aminoacid_chain)

'MGLTALANDTQKIIDANKDKTFAPIEKPNEPDLKVPA_'

In [6]:
# Only accept candidates with 5 to 25 characters between 'M' and '_'.
regex_longest_protein_range(aminoacid_chain, (5, 25))

'MLKQYPDTKATYEGLFLMQEAY_'

### Prosite to RegEx
[PROSITE](https://prosite.expasy.org/) consists of documentation entries describing protein domains, families and functional sites, as well as associated patterns and profiles to identify them.

In [7]:
# Example pattern in PROSITE notation. As translated by prosite_to_regex:
# '-' separates positions, 'x' is a wildcard, {..} lists excluded
# characters and (n) / (n,m) are repeat counts; [..] is passed through.
prosite = "C-x-H-x-[LIVMFY]-C-x(2)-C-[LIVMYA](3,4)-{XK}"

In [8]:
from re import sub

In [9]:
def prosite_to_regex(p):
    """Translate a PROSITE-style pattern into Python regex syntax.

    Conversions applied:
        * ``-`` separators are removed.
        * ``x`` becomes ``.`` (any character).
        * ``{ABC}`` becomes ``[^ABC]`` (excluded characters).
        * ``(n)`` / ``(n,m)`` become ``{n}`` / ``{n,m}`` (repeat counts).

    ``[...]`` character sets are already valid regex and pass through
    unchanged. Any other characters in the pattern are not translated.

    Args:
        p: Pattern string in PROSITE notation.

    Returns:
        The equivalent regular-expression string.
    """
    p = p.replace("-", "").replace("x", ".")
    # Exclusions are rewritten first: once the repeat counts have become
    # {n}, they would be indistinguishable from {..} exclusion groups.
    # [^}]* stops at the first closing brace, so each {..} group is
    # converted on its own even when the pattern has several of them.
    p = sub(r"\{([^}]*)\}", r"[^\1]", p)
    return sub(r"\((.*?)\)", r"{\1}", p)

In [10]:
# Show the regex produced from the example PROSITE pattern.
prosite_to_regex(prosite)

'C.H.[LIVMFY]C.{2}C[LIVMYA]{3,4}[^XK]'

In [11]:
# print() is used so that a None result is visible: a bare None as the last
# expression of a cell produces no output. None here means the converted
# pattern did not match anywhere in the sample chain.
print(regex_longest_protein(aminoacid_chain, prosite_to_regex(prosite)))

None
