In [None]:
from src.UniProt import UniprotRecord, searchUniprot
import pandas as pd 

df = pd.read_csv('./data/uniprotIDs.csv')

# REtrieve pdb ID 
for id in df.id.values:
    u = searchUniprot(id)
    pdb = u.getPDBs()
    pdbids = pdb.keys()
    break

ModuleNotFoundError: No module named 'src.UniProt'

In [None]:
# -*- coding: utf-8 -*-
"""This module defines a class and relative functions for mapping Uniprot
sequences to PDB and Pfam databases."""
from rcsbapi.data import DataQuery as Query
import os
import re
import dill as pickle
import datetime
import time
import numpy as np
import urllib.parse
import requests 
import re
import traceback

import prody
from prody import parsePDB, Atomic, queryUniprot
from prody.utilities import openURL
from Bio.pairwise2 import align as bioalign
from Bio.pairwise2 import format_alignment

from prody import LOGGER

def searchUniprot(id):
    """Search Uniprot with *id* and return a :class:`UniprotRecord` containing the results. 
    """
    def _queryUniprot(*args, n_attempts=3, dt=1, **kwargs):
        """
        Redefine prody function to check for no internet connection
        """
        attempt = 0
        while attempt < n_attempts:
            try:
                _ = openURL('http://www.uniprot.org/')
                break
            except:
                LOGGER.info(
                    f'Attempt {attempt} to contact www.uniprot.org failed')
                attempt += 1
                time.sleep((attempt+1)*dt)
        else:
            _ = openURL('http://www.uniprot.org/')
        return queryUniprot(*args, **kwargs)

    data = _queryUniprot(id)
    return UniprotRecord(data)

comma_splitter = re.compile(r'\s*,\s*').split
ns = {'up': 'http://uniprot.org/uniprot'}
    
class UniprotRecord(object):
    """This class provides a wrapper for UniProt data including functions 
    for accessing particular fields and parsing associated PDB entries."""
    def __init__(self, data):
        self._rawdata = data
        self._pdbdata = []
        self._parse()

    def __repr__(self):
        return '<UniprotRecord: %s>'%self.getTitle()

    def __str__(self):
        return self.getTitle()

    def setData(self, value):
        self._rawdata = value
        self._parse()

    def getData(self):
        return self._rawdata

    def getPDBs(self):
        return self._pdbdata
    
    def getAccession(self, index=0):
        """accession tag"""
        return self.getEntry('accession', index)
    
    def getName(self, index=0):
        """name tag"""
        return self.getEntry('name', index)

    def getProtein(self, index=0):
        """protein tag"""
        protein = self.getEntry('protein', index)
        protein = protein.find('up:recommendedName/up:fullName', ns)
        if protein is not None:
            return protein.text

    def getGene(self, index=0):
        """gene tag"""
        pass

    def getOrganism(self, index=0):
        """organism tag"""
        pass

    def getReference(self, index=0):
        """reference tag"""
        pass
    
    def getComment(self, index=0):
        """conmment tag"""
        pass
        
    def getDBreference(self, index=0):
        """dbReference tag"""
        pass
    
    def getProteinExistence(self, index=0):
        """proteinExistence tag"""
        pass
     
    def getKeyword(self, index=0):
        """keyword tag"""
        pass
    
    def getFeature(self, index=0):
        """feature tag"""
        pass
    
    def getEvidence(self, index=0):
        """evidence tag"""
        pass
    
    def getSequence(self, index=0):
        return self.getEntry('sequence', index)
    
    def getZincFinger(self):
        return self._zinc_finger
    
    def getDNAbinding(self):
        return self._dna_binding
    
    def getActivateSite(self):
        return self._active_site
    
    def getBindingSite(self):
        return self._binding_site
    
    def getCofactor(self):
        return self._cofactors
    
    def getTitle(self):
        uid = self.getAccession()
        name = self.getName()
        return '%s (%s)'%(uid, name)

    def getEntry(self, item, index=0):
        key = '%s%4d'%(item, index)
        if key in self._rawdata:
            return self._rawdata[key]
        else:
            raise KeyError('%s does not exist in the Uniprot record'%key)

    def _parseDNAbinding(self):
        data = self._rawdata
        dna_binding = []
        for key, value in data.items():
            if not key.startswith('feature'):
                continue
            if value.get('type') != "DNA-binding region":
                continue
            """
            <feature type="DNA-binding region" description="HMG box 1" evidence="4">
                <location>
                <begin position="9"/>
                <end position="79"/>
                </location>
            </feature>
            """
            descp = value.get('description')
            begin_elem = value.find('up:location/up:begin', ns)
            end_elem = value.find('up:location/up:end', ns)
            begin = begin_elem.attrib.get('position') if begin_elem is not None else None
            end = end_elem.attrib.get('position') if end_elem is not None else None
            dna_binding.append({
                'description': descp, 
                'begin': begin, 
                'end': end
            })
        self._dna_binding = dna_binding
            
    def _parseZincfinger(self):
        data = self._rawdata
        zinc_finger = []
        for key, value in data.items():
            if not key.startswith('feature'):
                continue
            if value.get('type') != "zinc finger region":
                continue
            """
            <feature type="zinc finger region" description="C2H2-type 1" evidence="1">
                <location>
                <begin position="110"/>
                <end position="133"/>
                </location>
            </feature>
            """
            descp = value.get('description')
            begin_elem = value.find('up:location/up:begin', ns)
            end_elem = value.find('up:location/up:end', ns)
            begin = begin_elem.attrib.get('position') if begin_elem is not None else None
            end = end_elem.attrib.get('position') if end_elem is not None else None
            zinc_finger.append({
                'description': descp, 
                'begin': begin, 
                'end': end
            })
        self._zinc_finger = zinc_finger

    def _parseActiveSite(self):
        data = self._rawdata
        active_site = []
        for key, value in data.items():
            if not key.startswith('feature'):
                continue
            
            if value.get('type') != "active site":
                continue
            """
            <feature type="active site" description="Proton donor" evidence="2">
                <location>
                <position position="613"/>
                </location>
            </feature>
            """
            descp = value.get('description')
            pos_elem = value.find('up:location/up:position', ns)
            pos   = int(pos_elem.attrib.get('position')) if pos_elem is not None else None
            active_site.append({
                'description': descp, 
                'position': pos
            })
        self._active_site = active_site
    
    def _parseBindingSite(self):
        data = self._rawdata
        binding_site = []
        for key, value in data.items():
            if not key.startswith('feature'):
                continue
            
            if value.get('type') != "binding site":
                continue
            
            """
            <feature type="binding site" evidence="7 9 22 23 24">
                <location>
                <position position="617"/>
                </location>
                <ligand>
                <name>Zn(2+)</name>
                <dbReference type="ChEBI" id="CHEBI:29105"/>
                </ligand>
            </feature>
            """
            pos_elem = value.find('up:location/up:position', ns)
            pos   = int(pos_elem.attrib.get('position')) if pos_elem is not None else None
            
            ligand_elem = value.find('up:ligand', ns)
            ligand_name = ligand_elem.find('up:name', ns)
            ligand_name = ligand_name.text if ligand_name is not None else None
            ligand_chebi= ligand_elem.find('up:dbReference[@type="ChEBI"]', ns)
            ligand_chebi = ligand_chebi.attrib['id'] if ligand_chebi is not None else None
            binding_site.append({
                'position': pos, 
                'name': ligand_name, 
                'chebi': ligand_chebi
            })
        self._binding_site = binding_site
    
    def _parseCofactor(self):
        data = self._rawdata
        cofactors = []
        for key, value in data.items():
            if not key.startswith('comment'):
                continue
            
            if value.get('type') != "cofactor":
                continue
            """
            <comment type="cofactor">
                <cofactor evidence="2">
                    <name>cf_name</name>
                    <dbReference type="ChEBI" id=cf_chebi/>
                </cofactor>
            </comment>
            """
            cf_elem = value.find('up:cofactor', ns)
            # ---
            cf_name = cf_elem.find('up:name', ns)
            cf_chebi= cf_elem.find('up:dbReference[@type="ChEBI"]', ns)
            cf_chebi = cf_chebi.attrib['id'] if cf_chebi is not None else None
            # ---
            cofactors.append({
                'name': cf_name.text, 
                'chebi': cf_chebi
            })
        self._cofactors = cofactors
    
    def _parseLigand(self, pdblist):
        """
        Fetch data from RCSB graphQL using Data API
        https://rcsbapi.readthedocs.io/en/latest/data_api/quickstart.html
        
        # Available return_data_list fields: 
        # https://data.rcsb.org/data-attributes.html 
        """
        
        query = Query(
            input_type="entries",
            input_ids=pdblist,
            return_data_list=[
                "nonpolymer_entities.pdbx_entity_nonpoly.comp_id",
            ]
        )
        r = query.exec()
        
        ligands = {}
        for entry in r['data']['entries']:
            pdbid = entry['rcsb_id']
            ligand_list = entry.get('nonpolymer_entities', None)
            # Return None if no ligand
            if ligand_list is None:
                ligands[pdbid] = None
                continue

            ligand = [
                entity['pdbx_entity_nonpoly']['comp_id'] \
                for entity in ligand_list
            ]
            ligands[pdbid] = ligand
        return ligands
    
    def _parse(self):
        data = self._rawdata
        PDBdata = {}
        for key, value in data.items():
            if not key.startswith('dbReference'):
                continue
            try:
                pdbid = value['PDB']
            except (KeyError, TypeError) as e:
                continue
            """
            <dbReference type="PDB" id=pdbid>
                <property type="method" value="EM"/>
                <property type="resolution" value=resolution/>
                <property type="chains" value=pdbchains/>
    		</dbReference>
            """
            method = value['method']
            pdbchains = value['chains'] # e.g. "B/D/F/G/H/I=1-450"
            resolution = value.get('resolution', '1.00 A')
            resolution = float(resolution.split(' ')[0])
            
            # example chain strings: "A=27-139, B=140-150" or "A/B=27-150"
            chains = []
            pdbchains = comma_splitter(pdbchains)
            for chain in pdbchains:
                chids, resrange = chain.split('=')
                chids = [chid.strip() for chid in chids.split('/')]
                for chid in chids:
                    chains.append(chid)
                    
            PDBdata[pdbid] = {
                'method': method,
                'resolution': resolution,
                'chains': chains,
                'resrange': resrange,
            }

        pdblist = list(PDBdata.keys())
        if len(pdblist) != 0:
            ligands = self._parseLigand(pdblist)
            for pdbid in PDBdata:
                PDBdata[pdbid]['ligand'] = ligands[pdbid]

        self._parseActiveSite()
        self._parseBindingSite()
        self._parseCofactor()
        self._parseDNAbinding()
        self._parseZincfinger()
        
        self._pdbdata = PDBdata

In [None]:
from src.UniProt import searchUniprot

id = 'O76074'
u = searchUniprot(id)
pdb = u.getPDBs()

In [322]:
u.getCofactor(), pdb

(["pyridoxal 5'-phosphate"],
 {'1D7K': {'method': 'X-ray',
   'resolution': 2.1,
   'chains': ['A', 'B'],
   'resrange': '7-427',
   'ligand': None},
  '2ON3': {'method': 'X-ray',
   'resolution': 3.0,
   'chains': ['A', 'B'],
   'resrange': '1-461',
   'ligand': ['XAP']},
  '2OO0': {'method': 'X-ray',
   'resolution': 1.9,
   'chains': ['A', 'B'],
   'resrange': '1-461',
   'ligand': ['PLP', 'XAP', 'ACT', 'N2P']},
  '4ZGY': {'method': 'X-ray',
   'resolution': 2.63,
   'chains': ['A'],
   'resrange': '2-421',
   'ligand': ['PLP', 'MG']},
  '5BWA': {'method': 'X-ray',
   'resolution': 3.2,
   'chains': ['A'],
   'resrange': '1-461',
   'ligand': ['PLP']},
  '7S3F': {'method': 'X-ray',
   'resolution': 2.49,
   'chains': ['A', 'B'],
   'resrange': '1-424',
   'ligand': ['PLP', 'XAP']},
  '7S3G': {'method': 'X-ray',
   'resolution': 1.66,
   'chains': ['A', 'B'],
   'resrange': '1-424',
   'ligand': ['PLP', 'CIT']},
  '7U6P': {'method': 'X-ray',
   'resolution': 2.35,
   'chains': ['A', 

In [377]:
u._active_site

[{'description': 'Proton donor', 'position': 613}]

In [386]:
from rcsbapi.data import DataQuery
query = DataQuery(
    input_type="entries",
    input_ids=['1RKP', '1T9R'],
    return_data_list=[
        "nonpolymer_entities.pdbx_entity_nonpoly.comp_id",
        "nonpolymer_entities.pdbx_entity_nonpoly.name",
        "nonpolymer_entities.rcsb_nonpolymer_entity.pdbx_description",
    ]
)
r = query.exec()
r


{'data': {'entries': [{'rcsb_id': '1RKP',
    'nonpolymer_entities': [{'pdbx_entity_nonpoly': {'comp_id': 'ZN',
       'name': 'ZINC ION'},
      'rcsb_nonpolymer_entity': {'pdbx_description': 'ZINC ION'}},
     {'pdbx_entity_nonpoly': {'comp_id': 'MG', 'name': 'MAGNESIUM ION'},
      'rcsb_nonpolymer_entity': {'pdbx_description': 'MAGNESIUM ION'}},
     {'pdbx_entity_nonpoly': {'comp_id': 'IBM',
       'name': '3-ISOBUTYL-1-METHYLXANTHINE'},
      'rcsb_nonpolymer_entity': {'pdbx_description': '3-ISOBUTYL-1-METHYLXANTHINE'}}]},
   {'rcsb_id': '1T9R',
    'nonpolymer_entities': [{'pdbx_entity_nonpoly': {'comp_id': 'ZN',
       'name': 'ZINC ION'},
      'rcsb_nonpolymer_entity': {'pdbx_description': 'ZINC ION'}},
     {'pdbx_entity_nonpoly': {'comp_id': 'PO4', 'name': 'PHOSPHATE ION'},
      'rcsb_nonpolymer_entity': {'pdbx_description': 'PHOSPHATE ION'}},
     {'pdbx_entity_nonpoly': {'comp_id': 'CIT', 'name': 'CITRIC ACID'},
      'rcsb_nonpolymer_entity': {'pdbx_description': 'CITRI

In [326]:
query = Query(
    input_type="entries",
    input_ids=list(u._pdbdata.keys()),
    return_data_list=[
        "nonpolymer_entities.pdbx_entity_nonpoly.comp_id",
        "polymer_entities.rcsb_target_cofactors.cofactor_SMILES",
    ]
)
r = query.exec()
r

{'data': {'entries': [{'rcsb_id': '1D7K',
    'nonpolymer_entities': None,
    'polymer_entities': [{'rcsb_target_cofactors': [{'cofactor_SMILES': 'N[C@@H](CCCNC(=O)CCl)C(=O)O'},
       {'cofactor_SMILES': 'CNCCCC(C)(N)C(=O)O'},
       {'cofactor_SMILES': 'C[C@@](N)(CCCN)C(=O)O.Cl'},
       {'cofactor_SMILES': 'NCCCCNCCCN'},
       {'cofactor_SMILES': 'CC(N)(CCCNCCCN)C(=O)O'},
       {'cofactor_SMILES': 'CC(N)(CCCNCCCN)C(=O)O.Cl'},
       {'cofactor_SMILES': 'CC1=NC=C(COP(O)(O)=O)C(C=O)=C1O'},
       {'cofactor_SMILES': 'NCCCNCCCCNCCCN'},
       {'cofactor_SMILES': 'CCCC1=NN(C)C2=C1N=C(NC2=O)C1=CC(=CC=C1OCC)S(=O)(=O)N1CCN(C)CC1'},
       {'cofactor_SMILES': 'CCCC(CCC)C(O)=O'},
       {'cofactor_SMILES': 'NCCCCN'},
       {'cofactor_SMILES': 'CC1=NC=C(COP(O)(O)=O)C(CO)=C1O'},
       {'cofactor_SMILES': 'CC1=NC=C(COP(O)(O)=O)C(CNCC(O)=O)=C1O'},
       {'cofactor_SMILES': 'NCCC[C@@](N)(C(F)F)C(O)=O'},
       {'cofactor_SMILES': 'CC1=NC=C(COP(O)(O)=O)C(\\C=N\\CCCC[C@H](N)C(O)=O)=C1O'},
   

In [339]:
from rcsbapi.data import DataQuery as Query

query = Query(
    input_type="chem_comps",
    input_ids=["PLP"],
    return_data_list=[
        "chem_comps.rcsb_id",
        "chem_comp.name",
        "pdbx_chem_comp_descriptor.comp_id",
        "pdbx_chem_comp_descriptor.descriptor"
    ]
)
result_dict = query.exec()
result_dict

{'data': {'chem_comps': [{'rcsb_id': 'PLP',
    'chem_comp': {'name': "PYRIDOXAL-5'-PHOSPHATE"},
    'pdbx_chem_comp_descriptor': [{'comp_id': 'PLP',
      'descriptor': 'O=P(O)(O)OCc1cnc(c(O)c1C=O)C'},
     {'comp_id': 'PLP', 'descriptor': 'Cc1ncc(CO[P](O)(O)=O)c(C=O)c1O'},
     {'comp_id': 'PLP', 'descriptor': 'Cc1ncc(CO[P](O)(O)=O)c(C=O)c1O'},
     {'comp_id': 'PLP', 'descriptor': 'Cc1c(c(c(cn1)COP(=O)(O)O)C=O)O'},
     {'comp_id': 'PLP', 'descriptor': 'Cc1c(c(c(cn1)COP(=O)(O)O)C=O)O'},
     {'comp_id': 'PLP',
      'descriptor': 'InChI=1S/C8H10NO6P/c1-5-8(11)7(3-10)6(2-9-5)4-15-16(12,13)14/h2-3,11H,4H2,1H3,(H2,12,13,14)'},
     {'comp_id': 'PLP', 'descriptor': 'NGVDGCNFYWLIFO-UHFFFAOYSA-N'}]}]}}

In [None]:
data = u._rawdata
cofactors = []
for key, value in data.items():
    if not key.startswith('comment'):
        continue
    
    if value.get('type') != "cofactor":
        continue
    """
    <comment type="cofactor">
        <cofactor evidence="2">
            <name>cf_name</name>
            <dbReference type="ChEBI" id=cf_chebi/>
        </cofactor>
    </comment>
    """
    cf_elem = value.find('up:cofactor', ns)
    # ---
    cf_name = cf_elem.find('up:name', ns)
    cf_chebi= cf_elem.find('up:dbReference[@type="ChEBI"]', ns)
    cf_chebi = cf_chebi.attrib['id'] if cf_chebi is not None else None
    # ---
    cofactors.append({'name': cf_name.text, 'chebi': cf_chebi})

cofactors

pyridoxal 5'-phosphate CHEBI:597326


[{'name': "pyridoxal 5'-phosphate", 'chebi': 'CHEBI:597326'}]

In [366]:
# <feature type="zinc finger region" description="C2H2-type 1" evidence="1">
# <location>
# <begin position="110"/>
# <end position="133"/>
# </location>
# </feature>

data = u._rawdata
zinc_finger = []
for key, value in data.items():
    if not key.startswith('feature'):
        continue
    
    if value.get('type') != "zinc finger region":
        continue
    
    descp = value.get('description')

    begin_elem = value.find('up:location/up:begin', ns)
    end_elem = value.find('up:location/up:end', ns)

    begin = begin_elem.attrib.get('position') if begin_elem is not None else None
    end = end_elem.attrib.get('position') if end_elem is not None else None

    zinc_finger.append({
        'description': descp, 'begin': begin, 'end': end
    })
zinc_finger

[{'description': 'C2H2-type 1', 'begin': '110', 'end': '133'},
 {'description': 'C2H2-type 2', 'begin': '138', 'end': '160'},
 {'description': 'C2H2-type 3', 'begin': '169', 'end': '192'}]