# Chain labelling
This notebook reads the cluster information from maxclust.ndx and creates a .pdb in which every fibril is labelled as its own chain (one chain per fibril). It also creates a separate .pdb and trajectory for each chain.
The plumed part is needed to center the fibrils.

In [1]:
# Run parameters shared by the cells below.
nchains = 2_000        # expected number of peptide chains; also part of the input file names
concentration = 13     # appears in the input file names followed by "mM"
dt = 1_000             # passed to `gmx_mpi trjconv -dt`; also part of the trajectory file name

#### Extraction of the cluster information from maxclust.ndx

In [2]:
import os
import string
import sys

import numpy as np

import gromacs
from gromacs.fileformats import *
from gromacs import cbook

%cd make_fibril_2000_13mM_304K_analysis_test/analysis/
# For labelling is not necessary to have the maxclust.ndx of the entire simulation, it is enough just the last frame


# This step is necessary for the script below when different temp coordinates are written
with open('last_frame.ndx', 'r') as infile, open('last_frame_new.ndx', 'w') as outfile:
    data = infile.read()
    data = data.replace("clust ", "clust_")
    outfile.write(data)

### Import the index as uniqueNDX to obtain the clusters information
maxclust_ndx = NDX('last_frame_new.ndx')

### Creation of a dictionary which includes the index names and the atomnumber
maxclust_dict = maxclust_ndx.sizes

#print('Dictionary obtained from last_frame.ndx\n', maxclust_dict)

### Extraction of all the clusters, remove the max_clust (which is duplicated) and make a list of them
fibril_dict = {key:val for key, val in maxclust_dict.items() if val != 85}
fibril_list = list(fibril_dict.keys())
fibril_list.remove('max_clust_')

### Extraction of all the single molecules not part of a fibril and make a list of that
not_fibril_dict = {key:val for key, val in maxclust_dict.items() if val == 85}
not_fibril_list = list(not_fibril_dict.keys())

#print('Dictionary containing the fibril\n', fibril_dict)
print('List containing the fibril\n', fibril_list)
#print(not_fibril_dict)
print(f'There are {len(not_fibril_list)} molecules which are not part of any cluster')


/media/data/emanuele/TTR/greta_test_cutoff_55_ex_3/epsilon_0295_c12_Native2reweight_noN1_gd0/concentration_gradient/make_fibril_2000_13mM_304K_analysis_test/analysis
List containing the fibril
 ['clust_1302']
There are 52 molecules which are not part of any cluster


#### Creation of a new clean index for chain labelling, containing one group per cluster followed by a single group with all the peptides that are not part of a fibril

In [3]:
# Atoms in one peptide chain (the divisor used for the chain counts below).
ATOMS_PER_CHAIN = 85
# Atom numbers written per line of the index file, to keep the lines short.
NUMBERS_PER_LINE = 15

# Disable numpy's "..." truncation when arrays are converted to text.
# NOTE(review): this cell no longer converts arrays to text; the setting is kept
# in case later cells rely on it — TODO confirm and remove if unused.
np.set_printoptions(threshold=sys.maxsize)


def write_ndx_group(handle, name, atoms):
    """Write one index group to `handle`: a `[ name ]` header, then the atom numbers.

    Parameters
    ----------
    handle : writable text file object
    name : str
        Group name placed in the header.
    atoms : iterable of int
        Atom numbers of the group. May be empty, in which case only the
        header and the trailing blank line are written.
    """
    handle.write(f'[ {name} ]\n')
    numbers = [str(int(atom)) for atom in atoms]
    for start in range(0, len(numbers), NUMBERS_PER_LINE):
        handle.write(' '.join(numbers[start:start + NUMBERS_PER_LINE]) + '\n')
    handle.write('\n')


# Number of atoms of every group written, used for the sanity check at the end.
check_len = []

# The index is written to a temporary file and renamed only once it is complete,
# so an interrupted run never leaves a truncated pdb.ndx behind.
with open('pdb_temp.ndx', 'w') as pdb_temp_ndx:
    # One group per fibril cluster
    for f in fibril_list:
        atom_array = maxclust_ndx.get(f)
        write_ndx_group(pdb_temp_ndx, f, atom_array)
        print(f'{f} added in pdb_temp.ndx')
        check_len.append(len(atom_array))
        print(f'Containing {len(atom_array)} atoms\n and {len(atom_array)/ATOMS_PER_CHAIN} chains')

    # One final group gathering every peptide that is not part of a fibril
    free_atoms = []
    for nf in not_fibril_list:
        atom_array = maxclust_ndx.get(nf)
        free_atoms.extend(atom_array)
        check_len.append(len(atom_array))
    write_ndx_group(pdb_temp_ndx, 'not_fibril', free_atoms)

print(f'Added {len(not_fibril_list)} peptides in pdb_temp.ndx')

os.replace('pdb_temp.ndx', 'pdb.ndx')
print('pdb_temp.ndx renamed to pdb.ndx with corrections')

# Sanity check: every atom of every expected chain must be in exactly one group.
total_atoms = sum(check_len)
if total_atoms == nchains * ATOMS_PER_CHAIN:
    print(f'\n\n\n\nThere are {total_atoms} atoms, equals to {total_atoms // ATOMS_PER_CHAIN}\nCORRECT\n\n\n\n')
else:
    print('SOMETHING IS VERY WRONG')


clust_1302 added in pdb_temp.ndx
Containing 165580 atoms
 and 1948.0 chains
Added 52 peptides in pdb_temp.ndx
pdb_temp.ndx renamed to pdb.ndx with corrections




There are 170000 atoms, equals to 2000
CORRECT






In [4]:
### Group numbers and chain labels used by the labelling script
# One group number per fibril plus one extra for the trailing group that holds
# all the unbound peptides.
array1 = list(range(len(fibril_list) + 1))
print(f'There are {len(fibril_list)} fibrils in trajectory. The last group contains all peptides not bound\n',array1, '\n')

# One single-character label per group: a-z first (as before), then A-Z and 0-9.
# chr(97 + i) ran past 'z' into '{', '|', ... once there were more than 26 groups.
chain_alphabet = string.ascii_lowercase + string.ascii_uppercase + string.digits
if len(array1) > len(chain_alphabet):
    raise ValueError(
        f'{len(array1)} groups need labelling but only {len(chain_alphabet)} '
        'single-character chain labels are available'
    )
array2 = list(chain_alphabet[:len(array1)])
print(f'The chain labels are the following: {array2}\nas "{array2[-1]}" containing the not bound peptides')

There are 1 fibrils in trajectory. The last group contains all peptides not bound
 [0, 1] 

The chain labels are the following: ['a', 'b']
as "b" containing the not bound peptides


#### Script for chain labelling

In [6]:
for a1, a2 in zip(array1, array2):
    print(a1, a2)#.upper()) # Se le vuoi tutte maiuscole

    # For each fibril you want to obtain the pdb and the esp files.
    string_pdb = f'echo -e {a1} \\n | gmx_mpi editconf -f prod_{nchains}-{concentration}mM.tpr -n pdb.ndx -label {a2} -o clust{a1}.pdb'
    string_esp = f'echo -e {a1} \\n | gmx_mpi editconf -f prod_{nchains}-{concentration}mM.tpr -n pdb.ndx -label {a2} -o clust{a1}.esp'
    string_xtc = f'echo -e {a1} \\n | gmx_mpi trjconv -dt {dt} -f NOPBC_prod_{nchains}-{concentration}mM-dt{dt}.xtc -n pdb.ndx -o clust{a1}.xtc'
    print(string_pdb)
    !{string_pdb}
    print(string_esp)
    !{string_esp}
    print(string_xtc)
    !{string_xtc}

    # Here the script takes the atom number from the esp file and paste into a temporary file    
    string_sed = f"sed 's/{{//' clust{a1}.esp | awk '{{print $1+1}}' > numero{a1}"
    print(string_sed)
    !{string_sed}

    # Here you add a line with "1" to align the numbers with the correct atom to paste into the pdb
    string_sed2 = f"sed -i '1s/^/1\\n/' numero{a1}"
    print(string_sed2)
    !{string_sed2}

    # Here you paste the number in the pdb file and save into another temp file
    string_paste = f"paste numero{a1} clust{a1}.pdb > chain{a1}"
    print(string_paste)
    !{string_paste}

    # This step append all the pdbs and extract only the atoms.
    # Also you sort the atoms based on their real atom number and save into a temporary pdb file full with the fibril and not-fibril

string_cat = "cat chain* | grep ATOM | sort -k1 -g > temp_label-chain.pdb"
print(string_cat)
!{string_cat}
string_rm = "rm numero* chain* clust*.esp"
print(string_rm)
!{string_rm}

# In this step you delete the first column which is the atom number from the esp file and save into the final pdb file

string_cut = "cut -f 1 --complement temp_label-chain.pdb > label-chain.pdb"
print(string_cut)
!{string_cut}

string_rm2 = "rm temp_label-chain.pdb"
print(string_rm2)
!{string_rm2}


string_mkdir = "mkdir clust_pdbs"
print(string_mkdir)
!{string_mkdir}

string_mv = "mv clust*.pdb clust_pdbs/"
print(string_mv)
!{string_mv}

string_mv2 = "mv clust*.xtc clust_pdbs/"
print(string_mv2)
!{string_mv2}

!{'rm \#*'}

0 a
echo -e 0 \n | gmx_mpi editconf -f prod_2000-13mM.tpr -n pdb.ndx -label a -o clust0.pdb
Invalid MIT-MAGIC-COOKIE-1 key                :-) GROMACS - gmx editconf, 2020.3-MODIFIED (-:

                            GROMACS is written by:
     Emile Apol      Rossen Apostolov      Paul Bauer     Herman J.C. Berendsen
    Par Bjelkmar      Christian Blau   Viacheslav Bolnykh     Kevin Boyd    
 Aldert van Buuren   Rudi van Drunen     Anton Feenstra       Alan Gray     
  Gerrit Groenhof     Anca Hamuraru    Vincent Hindriksen  M. Eric Irrgang  
  Aleksei Iupinov   Christoph Junghans     Joe Jordan     Dimitrios Karkoulis
    Peter Kasson        Jiri Kraus      Carsten Kutzner      Per Larsson    
  Justin A. Lemkul    Viveca Lindahl    Magnus Lundborg     Erik Marklund   
    Pascal Merz     Pieter Meulenhoff    Teemu Murtola       Szilard Pall   
    Sander Pronk      Roland Schulz      Michael Shirts    Alexey Shvetsov  
   Alfons Sijbers     Peter Tieleman      Jon Vincent      Teemu 

# PLUMED