Code for counting free cysteines and cysteine bridges

In [1]:
# IPython shell escape (not plain Python): installs ProDy, which the next cell
# imports as `prd`.
!pip install prody



In [2]:
import os
import prody as prd
import math
from collections import defaultdict
import itertools
import pandas as pd
#from multiprocessing import Pool
from pathlib import Path

In [3]:
def distance(distances_1, distances_2):
    """Return the Euclidean distance between two points in 3-D space.

    Despite their names, both arguments are points rather than distances:
    each must unpack into exactly three coordinates ``(x, y, z)``.

    Args:
        distances_1: Coordinates of the first point.
        distances_2: Coordinates of the second point.

    Returns:
        The straight-line distance between the two points, as a float.

    Raises:
        ValueError: If either argument does not unpack into three values.
    """
    ax, ay, az = distances_1
    bx, by, bz = distances_2

    dx = bx - ax
    dy = by - ay
    dz = bz - az

    # Terms are added in x, y, z order; math.pow always yields a float.
    return math.sqrt(math.pow(dx, 2) + math.pow(dy, 2) + math.pow(dz, 2))

def ss_bonds(pdb, cutoff=4.0):
    """Count free cysteines and cysteine bridges in one structure file.

    Every residue named ``CYS`` that has an ``SG`` atom is collected. Two such
    residues form a bridge when their ``SG`` atoms lie within ``cutoff`` of
    each other. A cysteine is free when it takes part in no bridge at all.

    Cysteines without an ``SG`` atom cannot be evaluated; they are skipped and
    counted neither as free nor as bridged.

    Args:
        pdb: Path (or identifier) handed unchanged to ``prd.parsePDB``.
        cutoff: Largest SG-SG distance still counted as a bridge, in the
            coordinate units of the file (presumably angstroms for PDB
            input). Defaults to 4.0, the value this function has always used.

    Returns:
        A ``(free_cys, cys_bridges)`` tuple of ints. If the structure cannot
        be parsed or processed, a warning is emitted and ``(0, 0)`` is
        returned, so a batch run over many files is never aborted by one
        bad file.
    """
    import warnings  # function-scope import keeps this cell self-contained

    try:
        # NOTE(review): the notebook's recorded output still shows ProDy's
        # "@>" parse messages, so QUIET=True does not appear to silence them.
        # ProDy's logging is presumably controlled through
        # prd.confProDy(verbosity=...) instead; confirm before relying on it.
        prot = prd.parsePDB(pdb, QUIET=True)

        # One SG coordinate triple per cysteine, kept in a list so that
        # residues sharing chain/number can never be merged into one entry.
        sg_coords = []
        for residue in prot.iterResidues():
            if residue.getResname() != 'CYS':
                continue
            sulfur = residue.getAtom('SG')
            if sulfur is None:
                # No SG atom on this cysteine: nothing to measure.
                continue
            sg_coords.append(tuple(sulfur.getCoords()))

        bridged = set()  # indices of cysteines that are part of any bridge
        cys_bridges = 0
        indexed = enumerate(sg_coords)
        for (i, first_sg), (j, second_sg) in itertools.combinations(indexed, 2):
            if distance(first_sg, second_sg) <= cutoff:
                cys_bridges += 1
                bridged.update((i, j))

        # Free cysteines are counted per residue, not per non-bonded pair.
        free_cys = len(sg_coords) - len(bridged)

    except Exception as exc:
        # Deliberate best effort: report the failure but keep the batch going.
        warnings.warn(
            'ss_bonds: could not process {!r} ({!r}); returning (0, 0)'.format(
                pdb, exc),
            stacklevel=2,
        )
        return 0, 0

    return free_cys, cys_bridges

In [None]:
# Name of the dataset to analyse; it selects the sub-directory holding the
# structure files.
dataset_name = 'vhh_tsd'
structure_path = ''.join(['/reduced_strucs/reduced_', dataset_name, '/'])

# os.listdir returns every entry of the directory (not only *.pdb files), in
# arbitrary order.
structures = os.listdir(structure_path)

# Full path of each entry; structure_path already ends with '/'.
files = [structure_path + entry for entry in structures]

In [22]:
# Maps a structure identifier to the (free_cys, cys_bridges) tuple returned by
# ss_bonds for that file.
all_results = {}
for f in files:
    # Identifier = file name cut at its first '.', so files that share that
    # prefix overwrite one another in the mapping.
    structure = f.rsplit('/', 1)[-1].partition('.')[0]
    all_results[structure] = ss_bonds(f)

@> 1845 atoms and 1 coordinate set(s) were parsed in 0.04s.
@> 1925 atoms and 1 coordinate set(s) were parsed in 0.03s.


@> 1750 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1792 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1685 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1782 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1768 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1853 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1863 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1715 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1742 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1700 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1858 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1757 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1766 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1840 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1773 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1773 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> 1727 atoms and 1 coordinate set(s) we

In [None]:
# One row per structure: the dict key becomes the index and the two tuple
# members become columns; reset_index() then turns the index into a column.
df = pd.DataFrame.from_dict(all_results, orient='index').reset_index()
df.columns = ['SeqID', 'free_cys', 'cys_bridges']
# Bare expression so the notebook renders the table.
df

In [None]:
# Write the table to 'results_cys.csv' in the current working directory;
# index=False leaves the DataFrame's row index out of the file.
df.to_csv('results_cys.csv', index=False)