Code to get charge dispersion metrics (structure)

In [1]:
import os
import warnings
from collections import defaultdict, Counter
#from multiprocessing import Pool
from pathlib import Path

import numpy as np
import pandas as pd
import prody as prd

In [2]:
def _charge_spread(distances):
    """Return the population standard deviation of `distances`, or NaN if empty."""
    # np.std([]) also yields NaN, but only after emitting RuntimeWarnings;
    # make the "no residues of this class" case explicit instead.
    if len(distances) == 0:
        return np.nan
    return np.std(distances)


def get_charge_dispersion(pdb):
    """Measure how spread out charged residues are around a structure's centre.

    For every residue named HIS/ARG/LYS (treated as positive) or ASP/GLU
    (treated as negative), the distance between the structure's central atom
    and the residue's central atom (both chosen by ``prd.pickCentralAtom``)
    is computed. The dispersion of each class is the standard deviation
    (``np.std``, i.e. ddof=0) of those distances.

    Parameters
    ----------
    pdb : str
        Passed straight to ``prd.parsePDB``.

    Returns
    -------
    tuple
        ``(positive_dispersion, negative_dispersion)``. A class with no
        matching residue yields NaN. If anything raises while parsing or
        measuring, a ``RuntimeWarning`` is emitted and ``(0, 0)`` is
        returned, so batch runs keep going but failures stay visible.
    """
    positive_res = {'HIS', 'ARG', 'LYS'}
    negative_res = {'ASP', 'GLU'}

    try:
        # NOTE(review): QUIET=True does not appear to silence ProDy here (the
        # saved notebook output still shows its "@> ... atoms" log lines);
        # presumably ProDy's verbosity setting is what controls that - confirm.
        prot = prd.parsePDB(pdb, QUIET=True)

        central_atom = prd.pickCentralAtom(prot)
        # Presumably a rigid translation to the origin; if so it leaves the
        # distances below unchanged and is kept only to match prior behaviour.
        prd.moveAtoms(prot, to=np.zeros(3))

        # Single pass over the residues instead of one pass per charge class.
        positive_distances, negative_distances = [], []
        for res in prot.iterResidues():
            resname = res.getResname()
            if resname in positive_res:
                bucket = positive_distances
            elif resname in negative_res:
                bucket = negative_distances
            else:
                continue
            bucket.append(prd.calcDistance(central_atom, prd.pickCentralAtom(res)))

        positive_dispersion = _charge_spread(positive_distances)
        negative_dispersion = _charge_spread(negative_distances)

    except Exception as err:
        # Best-effort fallback is preserved, but no longer silent: a (0, 0)
        # row caused by a failure must be distinguishable from a real result.
        warnings.warn(
            'get_charge_dispersion failed for {!r} ({!r}); returning (0, 0)'.format(pdb, err),
            RuntimeWarning,
            stacklevel=2,
        )
        positive_dispersion, negative_dispersion = 0, 0

    return positive_dispersion, negative_dispersion
    

In [None]:
# Dataset to process; selects the directory of structures to read.
dataset_name = 'vhh_tsd'
structure_path = '/reduced_strucs/reduced_'+dataset_name+'/'
# os.listdir returns entries in arbitrary order and also lists sub-directories
# (e.g. notebook checkpoint folders). Sort for a reproducible row order and
# keep regular files only, so directories are not passed on as structures.
structures = sorted(
    entry for entry in os.listdir(structure_path)
    if os.path.isfile(os.path.join(structure_path, entry))
)
files = [structure_path + structure for structure in structures]


In [17]:
# Map each structure's identifier (its file name up to the first '.') to the
# (positive, negative) dispersion pair computed for that file.
all_results = {}
for pdb_file in files:
    file_name = pdb_file.rsplit('/', 1)[-1]
    seq_id = file_name.partition('.')[0]
    all_results[seq_id] = get_charge_dispersion(pdb_file)

@> 1845 atoms and 1 coordinate set(s) were parsed in 0.04s.


@> 1925 atoms and 1 coordinate set(s) were parsed in 0.03s.
@> 1750 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1792 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1685 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1782 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1768 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1853 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1863 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1715 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1742 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1700 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1858 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1757 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1766 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1840 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1773 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1773 atoms and 1 coordinate set(s) we

In [None]:
# One row per structure: the dict key becomes SeqID and the two tuple entries
# become the value columns. Naming the columns in from_dict (rather than
# renaming positionally after a transpose) also yields an empty, correctly
# labelled frame when all_results is empty instead of raising a
# length-mismatch ValueError.
df = (
    pd.DataFrame.from_dict(
        all_results,
        orient='index',
        columns=['positive_charge_heterogeneity', 'negative_charge_heterogeneity'],
    )
    .rename_axis('SeqID')
    .reset_index()
)
df

In [None]:
# Persist the table without the positional index; the SeqID column already
# carries the identifier derived from each file name.
df.to_csv('results_charge_disp.csv', index=False)