In [369]:
import pandas as pd 
import json
# Input files. Provenance notes kept from the original cell:
# 874 UniProt
# Plus P0A7V8, P0A7X3
# A2VLV3 E1FVX6 E1G1C3 A0A0D9MXW1
DATA_JSON_PATH = 'data/parseUniprot_881.json'
SITE_JSON_PATH = 'data/parseUniprot_site_881.json'

# Parsed UniProt entries. The path lives in its own constant so `data_json`
# only ever refers to the loaded data, never to a file name.
# JSON is read as UTF-8 explicitly instead of relying on the platform default.
with open(DATA_JSON_PATH, "r", encoding="utf-8") as f:
    data_json = json.load(f)

# Add-on records; the following cells read 'id', 'binding_site' and 'site'
# from each of them.
with open(SITE_JSON_PATH, "r", encoding="utf-8") as f:
    addon = json.load(f)
    
    

In [370]:
# Index the add-on site annotations by entry id so they can be looked up
# directly when merging them into the main records.
_dict = {
    record['id']: {
        'binding_site': record['binding_site'],
        'site': record['site'],
    }
    for record in addon
}

In [371]:
# Copy 'binding_site' and 'site' from the add-on lookup into every main
# record (in place), matching records by their 'id'.
for record in data_json:
    extra = _dict[record['id']]
    record['binding_site'] = extra['binding_site']
    record['site'] = extra['site']

In [6]:
import pandas as pd
from src.utils.logger import LOGGER
from src.UniProt import filter_essence, COFACTOR, COFACTOR_COENZYME

def _expand_residue_ranges(items):
    """Expand site annotations into a flat list of integer positions.

    Each item is either a single position (``'280'``) or an inclusive
    ``'start-end'`` range (``'99-101'``), e.g.
    ``['280', '99-101', '103-105']`` -> ``[280, 99, 100, 101, 103, 104, 105]``.
    """
    positions = []
    for item in items:
        item = str(item)
        if '-' in item:
            start, end = map(int, item.split('-'))
            positions.extend(range(start, end + 1))
        else:
            positions.append(int(item))
    return positions


def _none_last(value):
    """Return ``value`` unchanged, or +inf for None so that it sorts last."""
    return float('inf') if value is None else value


def pickPDBfromUniprot(entry, to_file=None):
    """Rank every PDB chain recorded for one UniProt entry.

    Parameters
    ----------
    entry : dict
        Parsed UniProt record; it is passed through ``filter_essence`` first.
    to_file : str or None, optional
        When given, the ranking is also written to this path as CSV.

    Returns
    -------
    list of tuple
        One tuple per (PDB entry, chain), best candidate first. Fields, in
        order: id, uni_cofactor, uni_seq_len, pdbid, chid, basid,
        all_pdb_ligands, pdb_cofactors, n_pdb_cofactors, pdb_ligand_or_drugs,
        n_pdb_ligand_or_drugs, resrange, modeled_len, n_mut, n_missing_fsite,
        coverage, resolution, r_free, active_site, binding_site, dna_binding,
        zinc_finger, missing_pdb_resID, missing_uniprot_resID.

    Ranking (first criterion wins, later ones break ties):
        1. more cofactors/coenzymes present in the structure
        2. fewer functional-site residues inside missing ranges
        3. higher coverage of the UniProt sequence
        4. lower resolution, then lower R-free (None ranks last)
        5. fewer non-cofactor ligands, then fewer mutations
    """
    data = filter_essence(entry)
    uniprot_id = data['id']  # uniprot id
    seq_len = len(data['sequence'])  # uniprot protein length

    # Cofactor: map every UniProt cofactor (by ChEBI) to its ligand ids.
    # The ids are kept per cofactor as well, because one ChEBI may map to
    # zero or several ligand ids and the flat list cannot be indexed by
    # cofactor position.
    uni_cofactor = data['uni_cofactor']
    ids_per_cofactor = [list(COFACTOR[cf['chebi']]) for cf in uni_cofactor]
    uni_cofactor_id = [lig for ids in ids_per_cofactor for lig in ids]
    if len(uni_cofactor_id) == 0:
        LOGGER.info(f"{uniprot_id}, no cofactor")
    else:
        LOGGER.info(f"{uniprot_id}, cofactor(s):")
        for cof, ids in zip(uni_cofactor, ids_per_cofactor):
            LOGGER.info(f"{cof['name']}, {cof['chebi']}, {'/'.join(map(str, ids))}")

    # Functional site
    _func_site_ = []
    for sname in ['active_site', 'binding_site', 'dna_binding', 'zinc_finger']:
        _site = data[sname]
        if len(_site) != 0:
            _func_site_.extend(_site)
            LOGGER.info(f"{sname}: {_site}")

    # Only membership is needed below, so build the set once.
    func_site = set(_expand_residue_ranges(_func_site_))

    # PDB structures (a missing/None 'pdb' field is treated as "no structures")
    pdb = data.get('pdb') or {}
    if len(pdb) == 0:
        LOGGER.warn(f"{uniprot_id} has no PDB structures")

    cofactor_ids = set(uni_cofactor_id)
    coenzyme_ids = set(COFACTOR_COENZYME)

    # Retrieve key information to rank PDB structures
    PDBrank = []
    for pdbid, pdb_entry in pdb.items():
        resolution = pdb_entry['resolution']
        pdb_ligand = pdb_entry['ligand']
        r_free = pdb_entry['ls_R_factor_R_free']
        seq_annot = pdb_entry['seq_annot']
        resrange = pdb_entry['resrange']

        # Retrieve ligand id
        if pdb_ligand is not None:
            pdb_ligand_id = [l['comp_id'] for l in pdb_ligand]
        else:
            pdb_ligand_id = []
        ligand_set = set(pdb_ligand_id)

        # _cofactors = UniProt cofactors + known coenzymes found in this PDB.
        # Everything else bound in the structure counts as ligand/drug, so a
        # coenzyme is never counted in both groups.
        _cofactors = (ligand_set & cofactor_ids) | (ligand_set & coenzyme_ids)
        _ligands_or_drugs = ligand_set - _cofactors

        # resolved_len with respect to uniprot sequence e.g., '100-250'
        if resrange is None:
            resolved_len = 0
            LOGGER.warn(f"{pdbid} has no resolved range")
        else:
            _split = resrange.split('-')
            resolved_len = int(_split[1]) - int(_split[0]) + 1

        # Iterate each chain (instance) recorded in uniprot
        for inst_id, value in seq_annot.items():
            n_mut = value['rcsb_mutation_count']
            bas_id = value['biological_assembly']

            # dict, key: missing pdb resID, value: uniprot ID
            # e.g., (1, 12): [(24, 30)]
            mapped = value['mapped']

            # modeled_len is resolved_len minus the missing residues
            n_missing_res = 0
            n_missing_fsite = 0
            for _, m in mapped.items():
                if len(m) == 0:
                    continue
                # NOTE(review): only the first mapped range is used, and its
                # end is treated as exclusive, whereas site ranges and
                # resrange above are inclusive - confirm against the producer
                # of 'mapped'. Behavior is unchanged here.
                _range = m[0]
                missing_range = range(_range[0], _range[1])
                n_missing_res += len(missing_range)
                n_missing_fsite += len(func_site.intersection(missing_range))

            modeled_len = resolved_len - n_missing_res
            # Guard against an empty sequence instead of dividing by zero.
            coverage = modeled_len / seq_len if seq_len else 0.0
            PDBrank.append(
                (
                    uniprot_id, uni_cofactor_id, seq_len, # 0, 1, 2
                    pdbid, inst_id, bas_id, pdb_ligand_id, # 3, 4, 5, 6
                    _cofactors, len(_cofactors), # 7, 8
                    _ligands_or_drugs, len(_ligands_or_drugs), # 9, 10
                    resrange, modeled_len, n_mut, n_missing_fsite, coverage, # 11, 12, 13, 14, 15
                    resolution, r_free, # 16, 17
                    # Extra columns
                    data['active_site'], data['binding_site'], data['dna_binding'], data['zinc_finger'],
                    list(mapped.keys()), list(mapped.values()),
                )
            )

    # len(_cofactors) (desc) -> n_missing_fsite (asc) -> coverage (desc)
    # -> resolution (asc) -> r_free (asc)
    # -> len(_ligands_or_drugs) (asc) -> n_mut (asc)
    # n_missing_fsite is sorted ascending: chains missing fewer functional-site
    # residues must come first. None metrics are mapped to +inf so they rank
    # last instead of raising TypeError during comparison.
    PDBrank.sort(key=lambda x: (-x[8], x[14],
                                -x[15], _none_last(x[16]), _none_last(x[17]),
                                x[10], _none_last(x[13]),
                                )) # Smallest score first
    # Save to csv file
    if to_file is not None:
        columns = [
            'id', 'uni_cofactor', 'uni_seq_len',
            'pdbid', 'chid', 'basid', 'all_pdb_ligands',
            'pdb_cofactors', 'n_pdb_cofactors',
            'pdb_ligand_or_drugs', 'n_pdb_ligand_or_drugs',
            'resrange', 'modeled_len', 'n_mut', 'n_missing_fsite', 'coverage',
            'resolution', 'r_free',

            # Extra columns
            'active_site', 'binding_site', 'dna_binding', 'zinc_finger',
            'missing_pdb_resID', 'missing_uniprot_resID',
        ]
        df = pd.DataFrame(columns=columns, data=PDBrank)
        df.to_csv(to_file, index=False)
    return PDBrank

# PDBrank = pickPDBfromUniprot(data)

In [7]:

from src.UniProt import searchUniprot

idlist = ['P05106','P05093','O14764','P18505','P08588','P23219','P19793','P35367','P31645','A8TX70','P02462']
# idlist = ['P35367']
#######

####### Parsing
# Rankings are collected per accession so that results of earlier IDs are not
# lost when the loop moves on. `data` and `PDBrank` are still rebound on every
# iteration, exactly as before.
PDBrank_by_id = {}
for uniprot_id in idlist:
    try:
        u = searchUniprot(uniprot_id)
        data = {
            'id': u.getAccession(),
            'name': u.getName(),
            'protein': u.getProtein(),
            'gene': u.getGene(),
            'organism': u.getOrganism(),
            'sequence': u.getSequence(),
            'cell_location': u.getCellLocation(),
            'cofactor': u.getCofactor(),
            'binding_site': u.getBindingSite(),
            'active_site': u.getActiveSite(),
            'dna_binding': u.getDNAbinding(),
            'zinc_finger': u.getZincFinger(),
            'site': u.getSite(),
            'pdb': u.getPDBs(),
            'alphafold': u.getAlphaFold(),
            }
        PDBrank = pickPDBfromUniprot(data, to_file=f'data/{uniprot_id}_PDBrank.csv')
        PDBrank_by_id[uniprot_id] = PDBrank
    except Exception as e:
        # Best-effort batch run: report the failing accession and continue
        # with the next one instead of aborting the whole loop.
        LOGGER.warn(f'Error while parsing {uniprot_id}: {e}')
        continue

@> Parse UniProt information of P05106...
2025-08-18 21:27:24,332 [INFO]-logger.info: Parse UniProt information of P05106...
@> Parsing in 82.5s.
2025-08-18 21:28:46,880 [DEBUG]-logger.debug: Parsing in 82.5s.
@> P05106, no cofactor
2025-08-18 21:28:46,883 [INFO]-logger.info: P05106, no cofactor
@> binding_site: ['147', '149', '149', '152', '153', '184', '241', '243', '245', '246', '246', '277', '277', '361']
2025-08-18 21:28:46,884 [INFO]-logger.info: binding_site: ['147', '149', '149', '152', '153', '184', '241', '243', '245', '246', '246', '277', '277', '361']
@> Parse UniProt information of P05093...
2025-08-18 21:28:52,935 [INFO]-logger.info: Parse UniProt information of P05093...
@> Parsing in 14.3s.
2025-08-18 21:29:07,216 [DEBUG]-logger.debug: Parsing in 14.3s.
@> P05093, cofactor(s):
2025-08-18 21:29:07,218 [INFO]-logger.info: P05093, cofactor(s):
@> heme, CHEBI:30413, HEM
2025-08-18 21:29:07,219 [INFO]-logger.info: heme, CHEBI:30413, HEM
@> binding_site: ['202', '442']
2025-0

In [193]:
# Run filter_essence once per entry and reuse the result for every column,
# instead of re-running it for each column.
# Assumes filter_essence is a pure function of the entry - TODO confirm.
essence_records = [filter_essence(entry) for entry in data_json]

### Common info.
# NOTE: `id` shadows the builtin; the name is kept because the DataFrame cell
# below passes it as `id=id`.
id = [rec['id'] for rec in essence_records]
pdbid = [rec['pdbid'] for rec in essence_records]
uni_cofactor = [rec['uni_cofactor'] for rec in essence_records]
protein = [rec['protein'] for rec in essence_records]
gene = [rec['gene'] for rec in essence_records]
organism = [rec['organism'] for rec in essence_records]
alphafold = [rec['alphafold'] for rec in essence_records]
### Functional site
binding_site = [rec['binding_site'] for rec in essence_records]
active_site = [rec['active_site'] for rec in essence_records]
site = [rec['site'] for rec in essence_records]
dna_binding = [rec['dna_binding'] for rec in essence_records]
zinc_finger = [rec['zinc_finger'] for rec in essence_records]
# Cellular location
intra_mem = [rec['intra_mem'] for rec in essence_records]
topol_dom = [rec['topol_dom'] for rec in essence_records]
trans_mem = [rec['trans_mem'] for rec in essence_records]


# all_pdb_ligands = []
# all_pdb_coenzymes = []
# for entry in data_json:
#     _pdb_ligands = []
#     _pdb = entry['pdb']
#     for k, v in _pdb.items():
#         ligands = v['ligand']
#         if ligands is None:
#             continue
#         for l in ligands:
#             comp_id = l['comp_id']
#             _pdb_ligands.append(comp_id)
#     # Unify list of all ligands in PDBs
#     _pdb_ligands = list(set(_pdb_ligands))
#     # Intersect with enz_list
#     _pdb_coenz = set(_pdb_ligands).intersection(set(enz_list))
    
#     all_pdb_ligands.append(_pdb_ligands)
#     all_pdb_coenzymes.append(list(_pdb_coenz))
    
# def flatten_list(_list):
#     flat_list = []
#     for item in _list:
#         if isinstance(item, list):
#             flat_list.extend(item)  # unpack sublist
#         else:
#             flat_list.append(item)
    
#     return list(set(flat_list))

# join_coenz_ion = []
# for i, ele in enumerate(uni_cofactor):
#     if len(ele) == 0:
#         uni_cofactor[i] = []
#     else:
#         # Extract chebi 
#         ligand_chebi = [cf['chebi'] for cf in ele]
#         # Convert to ligand id 
#         ligand_id = [cofactor_ion[l] for l in ligand_chebi]
#         uni_cofactor[i] = flatten_list(ligand_id)
        
#     # Join two list: all_pdb_coenzymes and uni_cofactor
#     _join = []
#     _join.extend(uni_cofactor[i])
#     _join.extend(all_pdb_coenzymes[i])
    
#     join_coenz_ion.append(list(set(_join)))
    
import pandas as pd 
# Column order of the summary table.
columns = [
    # Common info.
    'id', 'protein', 'gene', 'organism', 'uni_cofactor',
    # Cellular location
    'intra_mem', 'topol_dom', 'trans_mem',
    # Functional site
    'binding_site', 'active_site', 'site', 'dna_binding', 'zinc_finger',
    # Structures and ligands
    'pdbid', 'all_pdb_ligands', 'all_pdb_coenzymes', 'final_cofactor', 'alphafold',
]

# Values for the columns that are populated in this cell.
# 'all_pdb_ligands', 'all_pdb_coenzymes' and 'final_cofactor' are declared
# above but not assigned here (the code that computed them is commented out).
column_values = {
    'id': id,
    'protein': protein,
    'gene': gene,
    'organism': organism,
    'uni_cofactor': uni_cofactor,
    'intra_mem': intra_mem,
    'topol_dom': topol_dom,
    'trans_mem': trans_mem,
    'binding_site': binding_site,
    'active_site': active_site,
    'site': site,
    'dna_binding': dna_binding,
    'zinc_finger': zinc_finger,
    'pdbid': pdbid,
    'alphafold': alphafold,
}

# Start from an empty frame that fixes the column order, then fill it.
df = pd.DataFrame(columns=columns).assign(**column_values)
df.head()

Unnamed: 0,id,protein,gene,organism,uni_cofactor,intra_mem,topol_dom,trans_mem,binding_site,active_site,site,dna_binding,zinc_finger,pdbid,all_pdb_ligands,all_pdb_coenzymes,final_cofactor,alphafold
0,Q72547,Reverse transcriptase/RNaseH,pol,HIV-1,[],[],[],[],"[443, 443, 478, 498, 549]",[],[],[],[],"[2JLE, 3HYF]",,,,
1,P30968,Gonadotropin-releasing hormone receptor,GNRHR,Human,[],[],"[1-38, 59-77, 98-115, 138-164, 185-212, 233-28...","[39-58, 78-97, 116-137, 165-184, 213-232, 282-...",[],[],[],[],[],[7BR3],,,,P30968
2,P33681,T-lymphocyte activation antigen CD80,CD80,Human,[],[],"[35-242, 264-288]",[243-263],[],[],[],[],[],"[1DR9, 1I8L, 7TPS, 8FXW, 8FXZ, 8HXA]",,,,P33681
3,P42081,T-lymphocyte activation antigen CD86,CD86,Human,[],[],"[24-247, 269-329]",[248-268],[],[],[],[],[],"[1I85, 1NCN, 5YXK, 8FXX, 8FXY, 8HXB, 8HXC]",,,,P42081
4,P05106,Integrin beta-3,ITGB3,Human,[],[],"[27-718, 742-788]",[719-741],"[147, 149, 149, 152, 153, 184, 241, 243, 245, ...",[],[],[],[],"[1JV2, 1KUP, 1KUZ, 1L5G, 1M1X, 1M8O, 1MIZ, 1MK...",,,,P05106


In [160]:
# Count rows with a non-empty 'final_cofactor' and an empty 'trans_mem'.
has_final_cofactor = df['final_cofactor'].apply(len) != 0
no_trans_mem = df['trans_mem'].apply(len) == 0
len(df[has_final_cofactor & no_trans_mem])

193

In [194]:
# Write the summary table to CSV; the DataFrame index is not written.
df.to_csv('data/cofactor_881.csv', index=False)

In [32]:
import pandas as pd 
# Load the reference table and take its 'id' column as a plain list.
# `df` and `uniprotid` are rebound here, as in the original cell.
df = pd.read_csv('data/uniprotid.csv')
uniprotid = df['id'].tolist()
# Ids in the reference list that are absent from `ids`.
# NOTE(review): `ids` is not defined in any visible cell - confirm where it
# comes from before relying on a fresh-kernel run.
set(uniprotid).difference(set(ids))

{'A0A0D9MXW1', 'A2VLV3', 'E1FVX6', 'E1G1C3', 'P0A7V8', 'P0A7X3', 'P63231'}