Code to get charge dispersion metrics (structure)

In [1]:
import os
import pandas as pd
import prody as prd
import numpy as np
from collections import defaultdict, Counter
#from multiprocessing import Pool
from pathlib import Path

In [2]:
def get_charge_dispersion(pdb):
    
    try:

        prot = prd.parsePDB(pdb, QUIET=True)
        positive_res = ['HIS', 'ARG', 'LYS']
        negative_res = ['ASP', 'GLU']

        central_atom = prd.pickCentralAtom(prot)
        prd.moveAtoms(prot, to=np.zeros(3))

        positive_distances = [prd.calcDistance(central_atom, prd.pickCentralAtom(res)) for res in prot.iterResidues() if res.getResname() in positive_res]
        negative_distances = [prd.calcDistance(central_atom, prd.pickCentralAtom(res)) for res in prot.iterResidues() if res.getResname() in negative_res]
        positive_dispersion = np.std(positive_distances)
        negative_dispersion = np.std(negative_distances)
        
    except Exception:
        positive_dispersion, negative_dispersion = 0, 0
    
    return positive_dispersion, negative_dispersion
    

In [15]:
#cpus = 80

# path = '/storage/evagsm/nobackup/crystal_dataset/structures/ABB_paired_h/'
# files = [path + f for f in os.listdir(path) if f.endswith('.pdb')]

dataset_name = 'vhh_twist'
#structure_path = '/data/localhost/gordon/TNP_Project/results/debugged_nbb2_results/new_'+dataset_name+'_output/Raw_Model_Outputs/'
#structures = [str(s).split('/')[-1] for s in list(Path(structure_path).glob('**/*.pdb'))]
#files = [structure_path + structure[:-4] + '/' + structure for structure in structures]

structure_path = '/data/localhost/gordon/TNP_Project/RESULTS/greiff_results/reduced_strucs/reduced_'+dataset_name+'/'
structures = os.listdir(structure_path)
files = [structure_path + structure for structure in structures]

# pool = Pool(processes = cpus)
# results = pool.map(get_charge_dispersion, files)
# pool.close()
# pool.join()

In [16]:
len(files)

108

In [17]:
all_results = dict()
for f in files:
    structure = f.split('/')[-1].split('.')[0]
    all_results[structure] = get_charge_dispersion(f)

@> 1845 atoms and 1 coordinate set(s) were parsed in 0.04s.


@> 1925 atoms and 1 coordinate set(s) were parsed in 0.03s.
@> 1750 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1792 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1685 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1782 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1768 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1853 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1863 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1715 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1742 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1700 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1858 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1757 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1766 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1840 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1773 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1773 atoms and 1 coordinate set(s) we

In [18]:
print(len(all_results))

108


In [19]:
df = pd.DataFrame(all_results).T.reset_index()
df.columns = ['SeqID','positive_charge_heterogeneity', 'negative_charge_heterogeneity']
df

Unnamed: 0,SeqID,positive_charge_heterogeneity,negative_charge_heterogeneity
0,seq_TBC-101-VA10040_VHH,2.778050,3.358409
1,seq_D8D85764-3574-KV-cFR2muts,3.149607,3.311790
2,seq_Ozoralizumab_VHH1,2.792280,3.996248
3,seq_D9D88792-5767-N,3.711947,4.525567
4,seq_Brivekimig2_VHH1,3.113307,3.753762
...,...,...,...
103,seq_TBC-103-VA10005_VHH,2.958864,3.210456
104,seq_D9D88792-13450-N-cFR2muts,2.831209,3.694850
105,seq_TBC-107-01_VHH,3.257516,3.318367
106,seq_D9D88792-13450-KV,2.750438,3.692865


In [20]:
#df.insert(0, 'ID', [s.split('/')[-1][:-4] for s in files])
df.to_csv('/data/localhost/gordon/TNP_Project/RESULTS/greiff_results/charge_dispersion/vhh_twist_charge_disp.csv', index=None)