In [100]:
from urllib.request import urlopen
from urllib.error import HTTPError
import json
import logging

In [113]:
def pdb(id, resource="pdb", identifier=None, save=False):
    """
    Fetch a structure file or annotation data for one entry from the RCSB PDB.

    Args:
    - id            One PDB ID (str).
    - resource      "pdb": Returns the protein structure in PDB format.
                    "entry": Information about PDB structures at the top level of PDB structure hierarchical data organization.
                    "pubmed": Get PubMed annotations (data integrated from PubMed) for a given entry's primary citation.
                    "assembly": Information about PDB structures at the quaternary structure level.
                    "branched_entity": Get branched entity description (define entity ID as "identifier").
                    "nonpolymer_entity": Get non-polymer entity data (define entity ID as "identifier").
                    "polymer_entity": Get polymer entity data (define entity ID as "identifier").
                    "uniprot": Get UniProt annotations for a given macromolecular entity (define entity ID as "identifier").
                    "branched_entity_instance": Get branched entity instance description (define chain ID as "identifier").
                    "polymer_entity_instance": Get polymer entity instance (a.k.a chain) data (define chain ID as "identifier").
                    "nonpolymer_entity_instance": Get non-polymer entity instance description (define chain ID as "identifier").

    - identifier    None
                    or assembly ID (e.g. "1") (combine with resource="assembly")
                    or entity ID (e.g. "1") (combine with resource="branched_entity"/"nonpolymer_entity"/"polymer_entity"/"uniprot")
                    or chain (instance/asym) ID (e.g. "A") (combine with resource="branched_entity_instance"/"nonpolymer_entity_instance"/"polymer_entity_instance")
    - save          If True, additionally write the results to a file (path relative
                    to the current working directory): "<id>.pdb" for resource="pdb",
                    otherwise "<id>_<resource>.json" or "<id>_<identifier>_<resource>.json".
                    Default: False.

    Returns:
    The parsed JSON response (as Python objects) for all resources except "pdb",
    for which the PDB file content is returned as a str.
    Returns None (after logging an error) if the server answers with an HTTP error.

    Raises:
    - ValueError    If 'resource' is not one of the supported values, or a required
                    'identifier' is missing.
    - RuntimeError  If the server answers successfully but with a status other than 200.
    """

    # Check if resource argument is valid (order is kept for the error message)
    resources = (
        "pdb",
        "entry",
        "pubmed",
        "assembly",
        "branched_entity",
        "nonpolymer_entity",
        "polymer_entity",
        "uniprot",
        "branched_entity_instance",
        "polymer_entity_instance",
        "nonpolymer_entity_instance",
    )
    if resource not in resources:
        raise ValueError(
            f"'resource' argument specified as {resource}. Expected one of: {', '.join(resources)}"
        )

    # Check if required identifiers are present
    if resource == "assembly" and identifier is None:
        raise ValueError("Please define assembly ID (e.g. '1') as 'identifier'.")

    need_entity_id = (
        "branched_entity",
        "nonpolymer_entity",
        "polymer_entity",
        "uniprot",
    )
    if resource in need_entity_id and identifier is None:
        raise ValueError("Please define entity ID (e.g. '1') as 'identifier'.")

    need_chain_id = (
        "branched_entity_instance",
        "nonpolymer_entity_instance",
        "polymer_entity_instance",
    )
    if resource in need_chain_id and identifier is None:
        raise ValueError("Please define chain ID (e.g. 'A') as 'identifier'.")

    # Local alias so the rest of the body does not rely on a name that
    # shadows the builtin id().
    pdb_id = id
    is_pdb_file = resource == "pdb"

    # Define URL for the HTTP request
    if is_pdb_file:
        # URL to request PDB file
        url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    elif identifier is not None:
        # URLs to request resources other than PDB file
        url = f"https://data.rcsb.org/rest/v1/core/{resource}/{pdb_id}/{identifier}"
    else:
        url = f"https://data.rcsb.org/rest/v1/core/{resource}/{pdb_id}"

    # Submit URL request
    try:
        r = urlopen(url)
    except HTTPError as err:
        # Describe what was requested, e.g. "4HHB chain A"
        if resource == "assembly":
            target = f"{pdb_id} assembly {identifier}"
        elif resource in need_entity_id:
            target = f"{pdb_id} entity {identifier}"
        elif resource in need_chain_id:
            target = f"{pdb_id} chain {identifier}"
        else:
            target = f"{pdb_id}"

        if err.code == 404:
            logging.error(
                "%s for %s was not found. Please double-check arguments and try again.",
                resource,
                target,
            )
        else:
            # Do not report e.g. a server-side failure as "not found"
            logging.error(
                "Request of %s for %s failed: the RCSB server responded with status code: %s. "
                "Please double-check arguments and try again.",
                resource,
                target,
                err.code,
            )
        return None

    # The context manager guarantees the connection is closed, even if
    # reading or parsing the response fails.
    with r:
        if r.status != 200:
            raise RuntimeError(
                f"The RCSB server responded with status code: {r.status}. "
                "Please double-check arguments and try again.\n"
            )

        if is_pdb_file:
            # Read PDB file
            results = r.read().decode()
        else:
            # Read json formatted results
            results = json.load(r)

    if save:
        if is_pdb_file:
            # Save the PDB file; encoding matches the UTF-8 decoding above
            with open(f"{pdb_id}.pdb", "w", encoding="utf-8") as f:
                f.write(results)
        else:
            # Save the results in json format
            if identifier is not None:
                out_name = f"{pdb_id}_{identifier}_{resource}.json"
            else:
                out_name = f"{pdb_id}_{resource}.json"

            with open(out_name, "w", encoding="utf-8") as f:
                json.dump(results, f, ensure_ascii=False, indent=4)

    return results


In [144]:
# Example: fetch the PubMed annotations of entry 4HHB's primary citation
# (no identifier is needed for this resource, so the default of None is used).
pdb("4HHB", resource="pubmed")

{'rcsb_id': '6726807',
 'rcsb_pubmed_container_identifiers': {'pubmed_id': 6726807},
 'rcsb_pubmed_doi': '10.1016/0022-2836(84)90472-8',
 'rcsb_pubmed_abstract_text': 'The structure of human deoxyhaemoglobin was refined at 1.74 A resolution using data collected on film at room temperature from a synchrotron X-ray source. The crystallographic R-factor is 16.0%. The estimated error in atomic positions is 0.1 A overall, 0.14 A for main-chain atoms of internal segments, and 0.05 A for the iron atoms. The effects of intermolecular contacts on the structure were investigated; such contacts cause only highly localized distortions, as judged from the degree of molecular asymmetry that they induce. The geometry of the iron-nitrogen complex closely resembles that of the deoxymyoglobin structure of Takano (1977) and of the 5-co-ordinated model compounds of Hoard (1975) and Jameson et al. (1980). The distance of the iron from the mean plane of N(porphyrin) is 0.40(5) A and 0.36(5) A, respectively,