In [1]:
# standard library modules
import os, sys, errno, json, ssl, time
from urllib import request
from urllib.error import HTTPError

import pickle

In [2]:
# Output directory (relative path) that later cells join with the generated
# file names 'sequences.txt' and 'prosites.txt'.
DATA_DIR = '../data/protein'

In [3]:
# Root of the InterPro REST API; the InterPro endpoints below are built from it.
BASE_URL = "https://www.ebi.ac.uk/interpro/api"
# Prefix for the protein listing of one Pfam entry (a Pfam accession is appended).
PFAM_INDI_URL_PREFIX = BASE_URL + '/protein/UniProt/entry/pfam/'
# Prefix for a single protein record (a protein accession is appended).
PROTEIN_URL_PREFIX = BASE_URL + '/protein/uniprot/'
# Start URL for listing all Pfam entries.
PFAM_LIST_URL = BASE_URL + '/entry/all/pfam'

# Start URL for listing all PROSITE entries.
PROSITE_LIST_URL = BASE_URL + '/entry/all/prosite'
# Deliberately a plain string, not an f-string: the empty `{}` placeholder is
# filled in later with `PROSITE_URL_FORMAT.format(prosite_id)`. An f-string with
# an empty replacement field is a SyntaxError.
PROSITE_URL_FORMAT = 'https://prosite.expasy.org/{}.txt' # fill in something starting with 'PS'

In [4]:
def try_get_payload(curr_url, context, data_type='json', max_attempts=3, retry_wait=61):
    """Fetch one URL and decode its body, retrying on HTTP errors.

    Parameters
    ----------
    curr_url : str
        URL to request.
    context : ssl.SSLContext or None
        SSL context handed to ``urllib.request.urlopen``.
    data_type : str
        ``'json'`` (default) parses the body as JSON and follows its ``'next'``
        field; ``'txt'`` returns the body as decoded text.
    max_attempts : int
        Number of requests that may fail with a non-408 HTTP error before the
        error is re-raised.
    retry_wait : int or float
        Seconds to sleep before a retry.

    Returns
    -------
    tuple ``(last_page, next_url, payload)``
        * JSON: ``payload`` is the parsed object, ``next_url`` its ``'next'``
          value and ``last_page`` is True when that value is empty/missing.
        * Text: ``(True, None, text)``.
        * Status 204 (no data): ``(True, curr_url, None)``.
        * Status 408 on a response object: sleeps, then returns
          ``(False, curr_url, None)`` so the caller re-requests the same URL.
        * Unknown ``data_type``: writes a message to stderr and returns
          ``(False, None, None)``.

    Raises
    ------
    urllib.error.HTTPError
        When ``max_attempts`` requests failed with a non-408 HTTP error.
        Previously the ``raise`` branch was unreachable and the function
        returned ``(False, url, None)``, which made callers retry forever.

    Notes
    -----
    HTTP 408 raised as an ``HTTPError`` is retried without counting against
    ``max_attempts``. Other exceptions (e.g. ``URLError``, JSON decoding
    errors) are not caught here.
    """
    content_type = 'application/json' if data_type == 'json' else 'text/plain'
    failures = 0
    while True:
        try:
            req = request.Request(curr_url, headers={"Accept": content_type})
            # The context manager guarantees the connection is closed on every path.
            with request.urlopen(req, context=context) as res:
                status = res.status
                # No body is needed for the two "no payload" statuses.
                body = None if status in (408, 204) else res.read()
        except HTTPError as e:
            if e.code == 408:
                # Server-side timeout of a long running query: wait and ask again.
                time.sleep(retry_wait)
                continue
            failures += 1
            if failures >= max_attempts:
                sys.stderr.write("LAST URL: " + curr_url)
                raise
            # Any other HTTP error is retried until the attempt budget is used up.
            time.sleep(retry_wait)
            continue

        if status == 408:
            # wait, then let the caller continue its loop with the same URL
            time.sleep(retry_wait)
            return False, curr_url, None
        if status == 204:
            # no data, so the caller should leave its loop
            return True, curr_url, None
        if data_type == 'json':
            payload = json.loads(body.decode())
            next_url = payload.get('next')
            return not next_url, next_url, payload
        if data_type == 'txt':
            return True, None, body.decode('utf-8')
        sys.stderr.write("Unknown data type: " + data_type)
        return False, None, None

In [5]:
def get_family_full_list(start_url, upper_limit=1000000):
    """Collect entry accessions from a paginated listing endpoint.

    Pages are requested through ``try_get_payload`` starting at ``start_url``
    and following each page's next-URL until the last page is reported or at
    least ``upper_limit`` accessions have been gathered.

    Parameters
    ----------
    start_url : str
        URL of the first page.
    upper_limit : int
        Stop requesting further pages once this many accessions were read.
        The limit is only checked between pages, so the result can exceed it
        by up to one page.

    Returns
    -------
    list
        The ``item["metadata"]["accession"]`` value of every result read, in
        the order received.
    """
    id_result = []

    #disable SSL verification to avoid config issues
    context = ssl._create_unverified_context()

    next_url = start_url
    last_page = False

    while not last_page and len(id_result) < upper_limit:
        # `context` is a required argument of try_get_payload; omitting it
        # raised a TypeError on the very first request.
        last_page, next_url, payload = try_get_payload(next_url, context)
        if payload is None:
            # Nothing to read this round; the loop condition decides whether to retry.
            continue
        for item in payload["results"]:
            id_result.append(item["metadata"]["accession"])
            # Progress line every 1000 accessions.
            if len(id_result) % 1000 == 0:
                sys.stdout.write(f'{len(id_result)}-th read: {id_result[-1]}\n')
        # Don't overload the server, give it time before asking for more
        if next_url:
            time.sleep(1)
    return id_result

In [7]:
# Pfam accession list: download it once, then reuse the pickled copy on later runs.
pfam_fname = 'pfam_ids.pkl'
if not os.path.exists(pfam_fname):
    # No cache yet: fetch the full list and store it next to the notebook.
    id_result = get_family_full_list(PFAM_LIST_URL)
    with open(pfam_fname, 'wb') as f:
        pickle.dump(id_result, f)
else:
    # Cache hit: read the previously pickled list back.
    with open(pfam_fname, 'rb') as f:
        id_result = pickle.load(f)

In [8]:
# Sanity check: how many Pfam accessions were loaded or fetched by the previous cell.
print(len(id_result))

21979


In [9]:
def get_entry_ids(start_url, entry_set, entry_list, upper_limit=100000):
    """Add the accessions found under ``start_url`` to ``entry_set`` / ``entry_list``.

    Pages are fetched with ``try_get_payload`` until the last page is reported
    or ``entry_set`` holds at least ``upper_limit`` items. Every accession not
    yet in ``entry_set`` is added to the set and appended to the list, so both
    containers are modified in place. Nothing is returned.

    A progress line is written to stdout whenever more than 1000 new
    accessions were appended since the previous progress line.
    """
    # SSL verification is switched off to avoid certificate configuration issues
    ssl_ctx = ssl._create_unverified_context()

    page_url = start_url
    finished = False
    reported_len = len(entry_list)

    while not (finished or len(entry_set) >= upper_limit):
        finished, page_url, page = try_get_payload(page_url, ssl_ctx)
        if page is None:
            continue

        for record in page["results"]:
            accession = record["metadata"]["accession"]
            if accession in entry_set:
                continue
            entry_set.add(accession)
            entry_list.append(accession)

        current_len = len(entry_list)
        if current_len - reported_len > 1000:
            sys.stdout.write(f'{current_len}-th read: {entry_list[-1]}\n')
            reported_len = current_len

        # Pause between pages so the server is not flooded with requests
        if page_url:
            time.sleep(1)

In [9]:
def get_ids(start_url, protein_set, protein_list):
    """Add the protein accessions found under ``start_url`` to the given containers.

    This is ``get_entry_ids`` with the collection limit fixed at 100000; it
    delegates to that function instead of carrying a second copy of the same
    paging loop. ``protein_set`` and ``protein_list`` are modified in place and
    nothing is returned.
    """
    get_entry_ids(start_url, protein_set, protein_list, upper_limit=100000)

In [None]:
protein_list_fn = 'protein_list'
protein_set_fn = 'protein_set'
# Build both cache paths once so the existence check, the load and the
# checkpoint writes always refer to the same files.
protein_list_path = f'{protein_list_fn}.pkl'
protein_set_path = f'{protein_set_fn}.pkl'
if os.path.exists(protein_list_path) and os.path.exists(protein_set_path):
    # NOTE(review): the files are also written as checkpoints during the crawl
    # below, so a cache left behind by an interrupted run is loaded here as if
    # it were complete. Delete the .pkl files to force a fresh crawl.
    # pickle.load can execute code from the file; only load caches written by
    # this notebook.
    with open(protein_set_path, 'rb') as f:
        protein_set = pickle.load(f)
    with open(protein_list_path, 'rb') as f:
        protein_list = pickle.load(f)
else:
    protein_set = set()
    protein_list = []
    # Length of protein_list at the last checkpoint; None forces the first write.
    checkpointed_len = None
    for pfam_id in id_result:
        start_url = PFAM_INDI_URL_PREFIX + pfam_id
        get_entry_ids(start_url, protein_set, protein_list)
        # Checkpoint only when something was added: set and list grow together,
        # so an unchanged length means the files on disk are already current.
        if len(protein_list) != checkpointed_len:
            with open(protein_list_path, 'wb') as f:
                pickle.dump(protein_list, f)
            with open(protein_set_path, 'wb') as f:
                pickle.dump(protein_set, f)
            checkpointed_len = len(protein_list)

1020-th read: A0A060X6M9
2040-th read: A0A075DN36
3060-th read: A0A087W0E3
4080-th read: A0A091CUA7
5100-th read: A0A091IRM8
6120-th read: A0A091P9H5
7140-th read: A0A091UU47
8160-th read: A0A093GHP7
9180-th read: A0A093RY33
10200-th read: A0A099ZFK6
11220-th read: A0A0B1SQN2
12240-th read: A0A0C6G6Q4
13260-th read: A0A0F7DH26
14280-th read: A0A0K0CSW9
15300-th read: A0A0L7LBE2
16320-th read: A0A0N4TML8
17340-th read: A0A0N8JY23
18360-th read: A0A0P7XSR9
19380-th read: A0A0S2CC36
20400-th read: A0A0V1EIK6
21420-th read: A0A142BLT0
22440-th read: A0A161HKC6
23460-th read: A0A182F973
24480-th read: A0A182V6M9
25500-th read: A0A183TRZ0
26520-th read: A0A195DRH0
27540-th read: A0A1A8BRR1
28560-th read: A0A1A8MTI9
29580-th read: A0A1B0GEF6
30600-th read: A0A1B6MTP2
31620-th read: A0A1I7SD05
32640-th read: A0A1I8JQC7
33660-th read: A0A1L8HE61
34680-th read: A0A1S3G1A5


In [None]:
# Sanity check: number of distinct protein accessions collected or loaded by the previous cell.
print(len(protein_list))

In [None]:
def get_sequence(url_addr):
    """Fetch one protein record and return its sequence.

    Parameters
    ----------
    url_addr : str
        URL of the protein record; requested as JSON through ``try_get_payload``.

    Returns
    -------
    The value of ``payload["metadata"]["sequence"]``, or ``None`` when no
    payload was returned for the URL.
    """
    #disable SSL verification to avoid config issues
    context = ssl._create_unverified_context()
    _, _, payload = try_get_payload(url_addr, context)
    if payload is None:
        # There is no enclosing loop here, so `continue` was a SyntaxError;
        # report "no sequence" to the caller instead.
        return None
    curr_seq = payload["metadata"]["sequence"]
    # Don't overload the server between consecutive requests.
    time.sleep(1)
    return curr_seq

In [None]:
# Download the sequence of every collected protein and write one per line.
# Make sure the output directory exists; open(..., 'w') does not create it.
os.makedirs(DATA_DIR, exist_ok=True)
with open(os.path.join(DATA_DIR, 'sequences.txt'), 'w') as f:
    for protein_id in protein_list:
        start_url = PROTEIN_URL_PREFIX + protein_id
        seq = get_sequence(start_url)
        if seq is None:
            # Nothing came back for this protein; `None + '\n'` would raise a
            # TypeError, so report it and move on to the next accession.
            sys.stderr.write(f'No sequence for {protein_id}; skipped\n')
            continue
        f.write(seq + '\n')

In [None]:
def get_pattern(url_addr):
    """Download a PROSITE text entry and return its raw pattern.

    The entry is requested as plain text through ``try_get_payload``. The
    pattern is the concatenation of the content of every line starting with
    ``'PA   '``, with the terminating period removed.

    Parameters
    ----------
    url_addr : str
        URL of the PROSITE ``.txt`` entry.

    Returns
    -------
    str or None
        The pattern, or ``None`` when nothing was downloaded or the entry has
        no ``PA`` lines (e.g. it only contains a matrix).
    """
    #disable SSL verification to avoid config issues
    context = ssl._create_unverified_context()
    _, _, payload = try_get_payload(url_addr, context, data_type='txt')
    if payload is None:
        return None
    # Don't overload the server between consecutive requests.
    time.sleep(1)

    # get raw pattern: a long pattern is split over several 'PA' lines
    start_patt = 'PA   '
    curr_pattern = ''.join(
        line[len(start_patt):].strip()
        for line in payload.splitlines()
        if line.startswith(start_patt)
    )
    # Drop the period that terminates the pattern, but only if it is there, so
    # a pattern without one does not lose a real character.
    if curr_pattern.endswith('.'):
        curr_pattern = curr_pattern[:-1]
    if len(curr_pattern) == 0:
        # contains no pattern; only matrix
        return None
    return curr_pattern

In [None]:
# Now let us get queries
# PROSITE accession list: fetched once (capped via upper_limit=1000), then reused from the pickle.
prosite_fname = 'prosite_ids.pkl'
if not os.path.exists(prosite_fname):
    # No cache yet: download the list and store it for later runs.
    prosite_id_result = get_family_full_list(PROSITE_LIST_URL, upper_limit=1000)
    with open(prosite_fname, 'wb') as f:
        pickle.dump(prosite_id_result, f)
else:
    # Cache hit: read the previously pickled list back.
    with open(prosite_fname, 'rb') as f:
        prosite_id_result = pickle.load(f)

In [None]:
# Raw PROSITE patterns: downloaded once per accession, then cached as a pickle.
raw_prosite_fname = 'raw_prosite.pkl'
if os.path.exists(raw_prosite_fname):
    with open(raw_prosite_fname, 'rb') as f:
        raw_prosite = pickle.load(f)
else:
    raw_prosite = []
    # The accession list built by the previous cell is `prosite_id_result`;
    # `prosite_list` was never defined and raised a NameError.
    for prosite_id in prosite_id_result:
        start_url = PROSITE_URL_FORMAT.format(prosite_id)
        pat = get_pattern(start_url)
        if pat is None:
            # get_pattern returns None for entries without a pattern; storing
            # it would break the later conversion to a regular expression.
            continue
        raw_prosite.append(pat)
    with open(raw_prosite_fname, 'wb') as f:
        pickle.dump(raw_prosite, f)

In [None]:
def prosite_to_regex(raw_prosite):
    r"""Translate a raw PROSITE pattern into regular-expression syntax.

    Rewrites applied, in this order:

    * ``-`` (element separator) is removed
    * ``x`` becomes ``.``
    * ``{...}`` becomes the negated class ``[^...]``
    * ``(n)`` / ``(n,m)`` becomes the repetition ``{n}`` / ``{n,m}``
    * ``<`` becomes ``^`` and ``>`` becomes ``$``

    Special case: when the pattern ends with a class containing ``>`` (e.g.
    ``[G>]``), the last element becomes the group ``(?:[G]|\z)``. The group is
    required: without it the ``|`` splits the entire expression, so the result
    would also match the end of any text.

    NOTE(review): ``\z`` is kept from the original output; it presumably is the
    end-of-text anchor of the engine that reads the generated file -- confirm
    (Python's own ``re`` module spells it ``\Z`` in most versions).

    Parameters
    ----------
    raw_prosite : str
        Pattern without its terminating period, e.g. ``'C-x(2,4)-C'``.

    Returns
    -------
    str
        The translated expression, e.g. ``'C.{2,4}C'``.
    """
    reg = raw_prosite.replace('-', '')
    reg = reg.replace('x', '.')
    # Braces must be rewritten before parentheses, because parentheses are
    # turned into braces in the next step.
    reg = reg.replace('{', '[^')
    reg = reg.replace('}', ']')
    reg = reg.replace('(', '{')
    reg = reg.replace(')', '}')
    reg = reg.replace('<', '^')
    reg = reg.replace('>', '$')
    # special case: "one of these residues, or the end" as the last element
    if reg.endswith('$]'):
        class_start = reg.rfind('[')
        # Without an opening bracket the input is malformed; leave it untouched.
        if class_start != -1:
            # Raw string: '\z' is not a valid escape in a normal literal.
            reg = reg[:class_start] + '(?:' + reg[class_start:-2] + r']|\z)'
    return reg

In [None]:
# Write the converted PROSITE patterns, one regular expression per line,
# unless the query file already exists.
query_fn = os.path.join(DATA_DIR, 'prosites.txt')
if not os.path.exists(query_fn):
    # Make sure the output directory exists; open(..., 'w') does not create it.
    os.makedirs(DATA_DIR, exist_ok=True)
    with open(query_fn, 'w') as f:
        for pat in raw_prosite:
            if pat is None:
                # Entries without a pattern are stored as None; there is
                # nothing to convert for them.
                continue
            pat_reg = prosite_to_regex(pat)
            f.write(pat_reg + '\n')