In [271]:
import sys, errno, re, json, ssl
from urllib import request
from urllib.error import HTTPError
from time import sleep
from pathlib import Path
from tqdm import tqdm
from urllib.request import urlretrieve
import concurrent.futures as cf
import itertools

import numpy as np
import json
import pypdb as p
from pypdb import pdb_client
from pypdb.clients.search.search_client import perform_search
from pypdb.clients.search.search_client import ReturnType
from pypdb.clients.search.operators import text_operators
from collections import defaultdict
import re
import requests
import os
from fix_data import progressbar, download_url

data_dir = Path("/home/pbarletta/labo/23/paco/data")
raw_data = data_dir / "raw"

In [191]:
def gen_payload(query):
    return {
        "query": {
            "type": "group",
            'logical_operator': 'and',
            'nodes': [
                {
                    "type": "terminal",
                    "service": "text",
                    "parameters": {"operator": "exact_match", "value": "Protein (only)", "attribute": "rcsb_entry_info.selected_polymer_entity_types"}
                },
                {
                    "type": "terminal",
                    "service": "text",
                    "parameters": {"attribute": "rcsb_entry_info.deposited_polymer_entity_instance_count", "operator": "equals", "value": 1}
                },
                *[
                    {
                        "type": "terminal",
                        "service": "text",
                        "parameters": {k:v for k,v in zip(['attribute','operator','value'], q)}
                    }
                for q in query],
            ],
        },
        "request_options": {
            "group_by": {
                "aggregation_method": "sequence_identity",
                "similarity_cutoff": 50,
            },
            "group_by_return_type": "representatives",
            "return_all_hits": True
        },
        "return_type": "polymer_entity"
    }

In [331]:
query = [("rcsb_entry_info.resolution_combined",
          "less_or_equal",
          2.8)]
payload = gen_payload(query)
r = requests.get(f"https://search.rcsb.org/rcsbsearch/v2/query?json={json.dumps(payload)}")
response_dict = json.loads(r.text)
ids = [ id["identifier"].split("_")[0] for id in response_dict["result_set"]]

In [366]:
def download_pdbs(pdbs, data_dir, progress_bar=False):
    if progress_bar:
        gtor = tqdm(pdbs)
    else:
        gtor = pdbs
    for entry_id in gtor:
        entry_fn = f"{entry_id}.pdb.gz"
        linc = f"https://files.rcsb.org/download/{entry_fn}"
        try:
            urlretrieve(linc, data_dir / entry_fn)
        except HTTPError as e:
            print(entry_id, end='\t')
        except Exception as e:
            raise e

In [363]:
def download_pdbs_parallel(pdbs, data_dir, workers):
    with cf.ThreadPoolExecutor(max_workers=workers) as ex:
        futuros = []
        for chunk in np.array_split(pdbs, workers):
            futuros.append(ex.submit(download_pdbs, chunk, data_dir))
        
        with tqdm(total=workers) as pbar:
            for futu in cf.as_completed(futuros):
                if futu.exception():
                    raise futu.exception()
                pbar.update(1)

In [334]:
import time

start = time.time()
download_pdbs_parallel(ids, raw_data, len(ids) // 10)
time.time() - start

  0%|          | 0/2000 [00:00<?, ?it/s]


6TK8	7BK5	7QVD	7ZAF	8ANX	6XI3	6ZNB	8OPZ	7Q9R	7AV4	6T7N	6TBY	8ATJ	7MHE	7ZCD	6SGG	7AHW	8P1W	7NQG	7Q52	7OQD	6TVI	7NWO	7ATH	7TCV	7UMJ	7P24	6YGT	6T1C	7L82	7OMX	7ZUG	6Y0K	8A3K	7PJ4	7QNI	7KV2	7B0M	6TXQ	7OZ8	6TSB	7UC3	6SL9	7Q2T	7X5T	7M1B	7QD7	7NZC	7Z3M	7N50	8A5S	8AMT	6ZFQ	7OKR	8OO4	7PWI	7MY1	7WNS	6SWZ	7NVJ	6Z5O	7BAY	6ZM0	

KeyboardInterrupt: 

In [371]:
archivos_pdbgz = [ files for root, subdirs, files in os.walk(raw_data) ][0]
archivos_pdbgz = { pdb.split('.')[0] for pdb in archivos_pdbgz }

sobrantes = archivos_pdbgz - set(ids)
faltantes = set(ids) - archivos_pdbgz

In [374]:
download_pdbs_parallel(np.array(list(faltantes)), raw_data, 10)

  0%|          | 0/10 [00:00<?, ?it/s]

8ATJ	8ANX	7KV2	7PJ4	6Z5O	6ZN1	7ATH	7UC3	6T1C	7ATL	6ZM0	7X5T	6T7N	6T29	7AVC	7PWI	7Q2T	7B21	7NXZ	7NWO	7NXL	6TK8	6XI3	7B0M	7QAR	6T2R	7ZEN8OO4		7TNB	7KWC	7BK5	7O2Y	7MY1	8AOZ	7M1B	7NVJ	7U71	7ZC0	7ZAF	7BAY	6Y0K	7Q3C	6TSB	6YGT	7ZUL	7AV4	7N51	6TVI	8P1W	7NVK	7P26	7N50	7NSZ	7OQD6ZI0	8OQ1		7TCV	7ZUG	6TJU	6TVK	8A5S	7QVD	7Q9R	6SWZ	7ZCD	7NZC	6TXQ	7Z3M	7OMX	7OUJ	7QD7	6ZFQ	6SGG	7NZQ7MKR	7O2T	7Q52		6SL9	7OST	7MHE	7AHW	7QNI	7KV5	7OZ8	7NM8	6TWP	8OPZ	6TO3	7WNS	7KV4	7OKR	6Y0R	

 10%|█         | 1/10 [00:11<01:47, 12.00s/it]

7UMJ	7EVF	7P24	7NZD	7OZC	6ZNB	6YLZ	7ODJ	7L82	

 60%|██████    | 6/10 [00:12<00:06,  1.63s/it]

7NQG	6ZIN	8A3K	6TBY	

 90%|█████████ | 9/10 [00:13<00:01,  1.07s/it]

8AMT	

100%|██████████| 10/10 [00:14<00:00,  1.42s/it]


### por qué hay sobrantes?

In [379]:
# remove excess PDBs
[ Path(raw_data / f"{entry_id}.pdb.gz").unlink() for entry_id in sobrantes]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

-------------

## Older stuff

In [2]:
def get_pdbs_from_accession(accession_id: str):
  requestURL = "https://www.ebi.ac.uk/proteins/api/proteins/" + accession_id
  r = requests.get(requestURL, headers={ "Accept" : "application/json"})

  try:
    texto = json.loads(r.text)
    pdb_list = [ db['id'] for db in texto['dbReferences'] if db['type'] == 'PDB' ]
    return pdb_list
  except Exception as e:
    raise e

In [5]:
accession_id = "P04637"
pdb_list = get_pdbs_from_accession("P04637")
print(len(pdb_list))

240


In [107]:
record = defaultdict(list)
pattern_str_strict = re.compile(r"EXPRESSION_SYSTEM:")
pattern_nbr_strict = re.compile(r"EXPRESSION_SYSTEM_TAXID:")
pattern_str = re.compile(r"EXPRESSION_SYSTEM:.*\n")
pattern_nbr = re.compile(r"EXPRESSION_SYSTEM_TAXID:.*\n")

listarella = ['1A1U', '2OCJ']

for pdbid in pdb_list:
    chunk = pdb_client.get_pdb_file(pdbid, pdb_client.PDBFileType.PDB)
    beg_str = re.search(pattern_str, chunk)
    beg_nbr = re.search(pattern_nbr, chunk)
    
    system = ""
    taxid = 0
    if beg_str:
        idx_system_raw = re.search(pattern_str_strict, beg_str.group()).span()[1]
        system = beg_str.group()[idx_system_raw:].strip().removesuffix(';')
    if beg_nbr:
        idx_taxid_raw = re.search(pattern_nbr_strict, beg_nbr.group()).span()[1]
        taxid = beg_nbr.group()[idx_taxid_raw:].strip().removesuffix(';')
    
    record[(system, taxid)].append(pdbid)

Sending GET request to https://files.rcsb.org/download/1A1U.pdb to fetch 1A1U's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1AIE.pdb to fetch 1AIE's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1C26.pdb to fetch 1C26's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1DT7.pdb to fetch 1DT7's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1GZH.pdb to fetch 1GZH's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1H26.pdb to fetch 1H26's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1HS5.pdb to fetch 1HS5's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1JSP.pdb to fetch 1JSP's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1KZY.pdb to fetch 1KZY's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1MA3.pdb to fetch 1MA3's pd

In [108]:
record

defaultdict(list,
            {('ESCHERICHIA COLI', '562'): ['1A1U',
              '1C26',
              '1DT7',
              '1GZH',
              '1H26',
              '1HS5',
              '1MA3',
              '1TSR',
              '1TUP',
              '1UOL',
              '1YC5',
              '1YCQ',
              '1YCR',
              '1YCS',
              '2AC0',
              '2ADY',
              '2AHI',
              '2ATA',
              '2BIM',
              '2BIN',
              '2BIO',
              '2BIP',
              '2BIQ',
              '2FEJ',
              '2GS0',
              '2H1L',
              '2H2D',
              '2H2F',
              '2H59',
              '2J1W',
              '2J1X',
              '2J1Y',
              '2J1Z',
              '2J20',
              '2J21',
              '2K8F',
              '2L14',
              '2LY4',
              '2MWO',
              '2MWP',
              '2MZD',
              '2RUK',
              '2WGX',
       

In [103]:
record

defaultdict(list,
            {('ESCHERICHIA COLI', '562'): '1A1U',
             ('ESCHERICHIA COLI BL21(DE3)', '469008;'): '2OCJ'})

In [95]:
re.search(pattern_nbr_strict, beg_nbr.group()).span()[1]

24

In [96]:
taxid

0

In [None]:
for i, j in record:
    print(i, j, record[i,j])

for i in range(0,75):
    print('-o', end='')
print()
dict_id_amount = dict()
for i, j in record:
    dict_id_amount[i,j] = len(record[i, j])
    print(i, j + '--> ', dict_id_amount[i,j])

---

In [24]:
BASE_URL = "https://www.ebi.ac.uk:443/interpro/api/protein/UniProt/structure/PDB/" +\
  "1afv/?id=&page_size=200"

def output_list():
  #disable SSL verification to avoid config issues
  context = ssl._create_unverified_context()

  next = BASE_URL
  last_page = False

  
  #json header
  sys.stdout.write("{ \"results\": [\n")
  
  attempts = 0
  while next:
    try:
      req = request.Request(next, headers={"Accept": "application/json"})
      res = request.urlopen(req, context=context)
      # If the API times out due a long running query
      if res.status == 408:
        # wait just over a minute
        sleep(61)
        # then continue this loop with the same URL
        continue
      elif res.status == 204:
        #no data so leave loop
        break
      payload = json.loads(res.read().decode())
      next = payload["next"]
      attempts = 0
      if not next:
        last_page = True
    except HTTPError as e:
      if e.code == 408:
        sleep(61)
        continue
      else:
        # If there is a different HTTP error, it wil re-try 3 times before failing
        if attempts < 3:
          attempts += 1
          sleep(61)
          continue
        else:
          sys.stderr.write("LAST URL: " + next)
          raise e

    for i, item in enumerate(payload["results"]):
      
      sys.stdout.write(json.dumps(item))
      # for indented output replace the above line with the following
      # sys.stdout.write(json.dumps(item, indent=4))
      # for 1 record per line uncomment the following line
      # sys.stdout.write("\n")

      if last_page and i+1 == len(payload["results"]):
        sys.stdout.write("")
      else:
        sys.stdout.write(",\n")
    
  return payload

In [25]:
a = output_list()

{ "results": [
{"metadata": {"accession": "P12497", "name": "Gag-Pol polyprotein", "source_database": "reviewed", "length": 1435, "source_organism": {"taxId": "11698", "scientificName": "Human immunodeficiency virus type 1 group M subtype B (isolate NY5)", "fullName": "Human immunodeficiency virus type 1 group M subtype B (isolate NY5) (HIV-1)"}}, "structures": [{"structure_protein_locations": [{"fragments": [{"start": 133, "end": 283}]}], "protein_structure_mapping": {"A": [{"protein_start": 133, "protein_end": 283, "structure_start": 1, "structure_end": 151, "author_structure_start": 1, "author_structure_end": 151}], "B": [{"protein_start": 133, "protein_end": 283, "structure_start": 1, "structure_end": 151, "author_structure_start": 1, "author_structure_end": 151}]}, "organism": {"taxid": "11698"}, "accession": "1afv", "chain": "A", "protein_length": 1435, "resolution": 3.7, "experiment_type": "x-ray", "source_database": "pdb", "entry_protein_locations": [{"fragments": [{"start": 27

In [20]:
b = a["results"][0]

In [23]:
b["metadata"]["accession"]

'P00533'

In [31]:
a["results"][0]["metadata"]["accession"], a["results"][1]["metadata"]["accession"]

('P12497', 'Q99LC4')

In [32]:
a["results"][0]["metadata"]

{'accession': 'P12497',
 'name': 'Gag-Pol polyprotein',
 'source_database': 'reviewed',
 'length': 1435,
 'source_organism': {'taxId': '11698',
  'scientificName': 'Human immunodeficiency virus type 1 group M subtype B (isolate NY5)',
  'fullName': 'Human immunodeficiency virus type 1 group M subtype B (isolate NY5) (HIV-1)'}}

In [33]:
a["results"][1]["metadata"]

{'accession': 'Q99LC4',
 'name': 'Igh protein',
 'source_database': 'unreviewed',
 'length': 463,
 'source_organism': {'taxId': '10090',
  'scientificName': 'Mus musculus',
  'fullName': 'Mus musculus (Mouse)'}}