# Chicken and egg in molecular metadynamics

### Create protein folding landmarks from scratch


The whole workflow is implemented as a single notebook to allow batch runs. It is structured into these major steps:

1. generate random landmarks by twisting the input structure backbone dihedrals, followed by steepest-descent minimization of the result to avoid side chain clashes etc.
1. compute low-dimensional embedding (isomap) of the landmarks to create collective variables
1. train a neural network to estimate the colvars from the structure, implement the resulting network as plumed input
1. run both vanilla molecular dynamics and the metadynamics to compare the trajectories


In [None]:
# The input: name of a local PDB file, looked up in the directory the notebook is started from

# pdbfile = "2f21.pdb"
pdbfile="1L2Y.pdb"

# Or PDB download: when `pdbid` is defined, the structure is downloaded
# and `pdbfile` is overwritten with '<pdbid>.pdb' by the loading cell
# pdbid = '1l2y'

In [None]:
# imports required by the whole workflow
import anncolvar

import os
import shutil
from contextlib import redirect_stdout
import re

import concurrent.futures

import numpy as np
import math
from scipy.sparse import coo_matrix

from pyDOE import lhs
from scipy.sparse.csgraph import shortest_path
from sklearn.manifold import MDS

import PeptideBuilder as pb
import Bio.PDB as pdb
import Bio.SeqUtils as sequtil

import mdtraj as md

import matplotlib.pyplot as plt
import nglview as nv

from xvg import read_xvg

In [None]:
# set the defaults
# for batch processing, those can be set in config.py so that the notebook itself needn't be modified

# number of steps to twist a dihedral when generating landmarks
nsteps = 12

# number of iterations of landmark generation; altogether niter * nsteps random conformers are generated
niter = 200

# number of steps of the production MD run
# md.mdp template sets 2 fs step
mdsteps = 100000  # 200 ps just to test it works, production runs are expected to be extended separately

# bounding box size to add: minbox is passed to the landmark minimization script,
# mdbox to `editconf -d` when the MD simulation box is built
minbox = 1.5
mdbox = 1.5


# count available cores -- check the output and set ncores to something else if necessary

# TODO: capture PBS settings when running in batch

if os.environ.get('OMP_NUM_THREADS') is None:
    ncores = int(os.popen('./ncores.sh').read())
    print('OMP_NUM_THREADS not set, using all (%d) available cores' % ncores)
else:
    ncores = int(os.environ.get('OMP_NUM_THREADS'))
    print('Using OMP_NUM_THREADS = %d cores' % ncores)


# XXX: poor man approach, expected to be tuned in config.py
ntomp = 4
ntmpi = ncores // ntomp
try:
    exec(open('config.py').read())
    !cat config.py
except FileNotFoundError:
    print('config.py not found, hope it\'s OK')


In [None]:
def scale(cores):
    """Scale the Kubernetes placeholder deployment to `cores` replicas.

    The deployment name is built from the K8S_LABEL environment variable.
    NOTE(review): when K8S_LABEL is unset the name contains the text 'None' --
    confirm this is never the case where the notebook is run.
    """
    !kubectl scale deployment.apps/chicken-and-egg{os.environ.get('K8S_LABEL')}-placeholder --replicas={cores}

In [None]:
# Load a PDB file, store it locally, create a matching workdir

# Guard against re-running: this cell changes the current directory to workdir,
# so a second run would take workdir for basedir.
try:
    workdir
    raise Exception("This cell should be run only once (workdir = %s)" % workdir)
except NameError:
    pass

basedir=os.getcwd()


# load the PDB file, create workdir, set global variables to be used later
do_load = False
try:
    # a defined pdbid means "download from the PDB"
    pdbid
    do_load = True
    pdbfile = pdbid + '.pdb'

except NameError:
    # otherwise derive the id from the local file name
    pdbid = os.path.splitext(os.path.basename(pdbfile))[0]

workdir=os.path.join(basedir,pdbid)
# wrapper scripts running gromacs and the landmark minimization in the workdir
gmx = f"{basedir}/gmx-k8s -w {pdbid}"
minim = f"{basedir}/minim-k8s -w {pdbid}"

# no error when the directory is left over from a previous session
os.makedirs(workdir, exist_ok=True)

os.chdir(workdir)


if do_load:
    pdbl = pdb.PDBList()
    pdbl.retrieve_pdb_file(pdbid,file_format='pdb')
    # Biopython stores the download as <code[1:3]>/pdb<code>.ent using the
    # lower-cased code, so look it up that way even if pdbid is upper case
    code = pdbid.lower()
    shutil.move(os.path.join(code[1:3], "pdb" + code + ".ent"), pdbid + ".pdb")
else:
    shutil.copy(os.path.join(basedir,pdbfile),os.path.join(workdir,pdbfile))

## 1 Generate landmarks by random twisting PDB structure

### 1.1 Initial preprocessing and visual check


In [None]:
# preprocess the file with gromacs to get consistent atom naming and numbering
# (the result is written to a temporary file first)
!{gmx} pdb2gmx -f {pdbfile} -o {pdbid}-new.pdb -water tip3p -ff amber94 -ignh


In [None]:
# replace the input file with the preprocessed one
!mv -f {pdbid}-new.pdb {pdbfile}

# count the heavy (non-hydrogen) atoms; minimized conformers are checked against this number later
m = md.load(pdbfile)
heavy_idx = m[0].top.select("element != H")
heavy_atoms = len(heavy_idx)
 

In [None]:
# inspect the loaded file
# it must be a sane structure, no missing heavy atoms and/or hydrogens etc.,
# suitable as the starting point of usual MD protocol

# NOTE(review): the widget is created while in basedir, presumably because nglview
# has to serve the file relative to the notebook directory -- confirm
os.chdir(basedir)
v = nv.NGLWidget()
v.add_component(os.path.join(workdir,pdbfile))
# drop the default representation and show cartoon only
v.clear()
v.add_representation('cartoon', selection='all')

# back to the working directory for the rest of the notebook
os.chdir(workdir)
v

### 1.2 Generate randomly twisted conformations

$\phi$ and $\psi$ backbone dihedral angles of all but first and last residue of the loaded structure are twisted randomly.

Systematic approach (e.g. 30 degree sampling of all angles) would yield too many conformations.
Instead we use random latin hypercube sampling to get uniform coverage of all values of all angles.

Empirically, running 10 times the number of residues (the `niter` parameter below) seems to be sufficient to cover the whole conformational space while keeping the number of landmarks still reasonable.

Expect approx. 1 s per 300 residues. It is worth inspecting some of the outputs visually (the following cell).

In [None]:
# parse the preprocessed structure
p = pdb.PDBParser()
instruct = p.get_structure('in',pdbfile)

# XXX: assuming one model and one chain, the method would be rather weird for more

# one-letter residue codes in the order the residues appear in the structure
resl = [sequtil.seq1(residue.get_resname()) for residue in instruct.get_residues()]
nres = len(resl)

# file name pattern of the generated conformers
out='conf%d.pdb'

# make it really reproducible
np.random.seed(123456789)
    
def random_twist(itrn):
    """Generate `nsteps` randomly twisted conformers for iteration `itrn`.

    The peptide is rebuilt residue by residue with PeptideBuilder. All residues
    except the first and the last get phi/psi angles drawn by latin hypercube
    sampling; prolines and the terminal residues are added without explicit
    angles. Conformer `s` of the iteration is saved under the file name
    pattern `out` with the number ``itrn * nsteps + s + 1``.

    Reads the globals `nres`, `nsteps`, `resl` and `out`; returns nothing.
    """
    # seed per iteration so that every call is reproducible on its own
    np.random.seed(itrn + 123456789)

    # one row per conformer, one column per twisted residue; samples are scaled to degrees below
    phi = lhs(nres - 2, nsteps)
    psi = lhs(nres - 2, nsteps)
    outf = pdb.PDBIO()

    for s in range(nsteps):
        struct = pb.initialize_res(pb.Geometry.geometry(resl[0]))

        for r in range(1,nres-1):
            if resl[r] == 'P':
                pb.add_residue(struct,resl[r])
            else:
                # fix: psi is taken from the same sample row `s` as phi
                # (it used to be row 2 for every conformer)
                pb.add_residue(struct,resl[r],phi[s][r-1]*360,psi[s][r-1]*360)

        pb.add_residue(struct,resl[nres-1])

        outf.set_structure(struct)
        outf.save(out % (itrn * nsteps + s + 1))
        
scale(ncores)
# XXX: better with hyperthreading but we don't want to eat up 2x cores when running in batch mode
with concurrent.futures.ProcessPoolExecutor(max_workers=ncores) as pool:
    # walk through all results so that an exception raised in a worker shows up here
    list(pool.map(random_twist,range(niter)))

scale(0)

In [None]:
# load all generated conformers as one trajectory and superpose them
# on the first one using the C-alpha atoms
tr = md.load([ "conf%d.pdb" % i for i in range(1,nsteps*niter+1)])
idx=tr[0].top.select("name CA")
tr.superpose(tr[0],atom_indices=idx)



In [None]:
# visual check of the twisted conformers (licorice representation only)
v=nv.show_mdtraj(tr)
v.clear()
v.add_representation("licorice")
v

In [None]:
# store the conformers before minimization as a trajectory file
tr.save('premin.xtc')

### 1.3 Minimize the generated structures

Run Gromacs steepest descent energy minimization in vacuo on all the generated structures. This is sufficient to fix colliding sidechains etc. while not changing the backbone dihedrals, hence preserving the conformational space coverage.

Expect approx. 25 structures per minute per core in case of small protein like trpcage (1L2Y).

In [None]:
# the most likely minimization parameters to change; rest is in the template file
minim_mdp = '''
emtol       = 500.0        ; Stop minimization when the maximum force is lower (kJ/mol/nm)
emstep      = 0.05          ; Minimization step size
nsteps      = 500         ; Maximum number of (minimization) steps to perform
'''

template = os.path.join(basedir,'minim.mdp.template')

!cp {template} minim.mdp
f=open('minim.mdp','a')
f.write(minim_mdp)
f.close()

!bash {minim} -n {ncores} -b {minbox}

In [None]:
# filter the results to the reasonable ones only

conflist = []   # numbers of the accepted conformers
frames = []     # heavy-atom structures of the accepted conformers
energies = []   # energies of the accepted conformers
maxenergy = 1e8

for i in range(1,nsteps*niter+1):
    # the energy is the second word on the first line of conf<i>.minen
    try:
        with open('conf%d.minen' % i) as ef:
            l = ef.readline()
            _,energy = l.split()
            energy = float(energy)
    except FileNotFoundError:
        print(i, "not found, something went wrong")
        continue

    fn = "conf%d-min.gro" % i
    if not os.path.isfile(fn):
        print(fn, "not found, ignoring")
        continue

    # keep heavy atoms only and make sure none is missing
    one = md.load(fn)
    heavy_idx = one[0].top.select("element != H")
    one.atom_slice(heavy_idx,inplace=True)
    if one.n_atoms != heavy_atoms:
        print("%d number of heavy atoms (%d) should be %d, ignoring" % (i,one.n_atoms, heavy_atoms))
        continue

    if energy < maxenergy:
        # fix: the frame is stored only for accepted conformers, so that frames,
        # conflist and energies always describe the same set of structures
        conflist.append(i)
        energies.append(energy)
        frames.append(one)
    else:
        print(i,"energy too high:", energy, 'ignoring')

print('remaining conformers', len(conflist))

### 1.4 Inspect the results

Minimized structures are merged into virtual trajectory and displayed as animation.

Histograms of their radius of gyration and energies (following cells) give some evidence of the conformational space coverage.

In [None]:
# merge the accepted structures into one virtual trajectory
tr=md.join(frames)
# NOTE(review): tr[0] presumably returns a new one-frame trajectory, in which case
# centering it does not affect tr itself -- confirm whether tr.center_coordinates() was meant
tr[0].center_coordinates()
# superpose all frames on the first one using the C-alpha atoms
idx=tr[0].top.select("name CA")
tr.superpose(tr[0],atom_indices=idx)

In [None]:
# show the landmarks as an animation (licorice representation only)
v=nv.show_mdtraj(tr,gui=False)
v.clear()
v.add_representation("licorice")
v

In [None]:
# histogram of the radius of gyration of the landmarks, saved to minim-rg.pdf
rgs=md.compute_rg(tr)

plt.rcParams.update({'font.size': 17})
plt.figure(figsize=(10,5))
plt.hist(rgs,200)
plt.xlabel('Radius of gyration (nm)')
plt.ylabel('# of samples')
plt.savefig('minim-rg.pdf')
plt.show()

In [None]:
# histogram of the energies of the accepted conformers (log scale), saved to minim-energ.pdf
plt.figure(figsize=(10,5))
plt.hist(energies,200)
plt.yscale('log')
plt.ylabel('# of samples')
plt.xlabel('Energy (kJ/mol)')
# NOTE(review): the fixed ticks assume energies within roughly 0-20000 kJ/mol -- confirm for other proteins
plt.xticks([0,5000,10000,15000,20000])
plt.savefig('minim-energ.pdf')
plt.show()

In [None]:
# save superposed landmarks as a multi-model PDB and as a Gromacs trajectory;
# the first landmark is stored separately and used as the reference structure later
tr.save_pdb('landmarks.pdb')
tr.save_xtc('landmarks.xtc')
tr[0].save_pdb('landmark1.pdb')

In [None]:
# cluster the minimized landmarks
# the purpose is checking density of sampling the conformational space
# the number of clusters should roughly match the number of landmarks, too few (dozens) clusters
# indicates the minimization went too far

!{gmx} -i 2,1 cluster -s landmark1.pdb -f landmarks.xtc -o clusters.xpm

## 2. Compute isomap projection of the landmarks

In [None]:
# number of nearest neighbours to consider (aka _k_)
neighs = 5
# target no. of dimensions
dims = 2

# allow starting here in a fresh session: reload the landmarks saved earlier
try:
    tr
except NameError:
    tr = md.load('landmarks.pdb')

In [None]:
scale(ncores)
# nconf = niter * nsteps
nconf = len(tr)

# RMSD of every landmark to all the others, keeping the _k_ closest ones
# as edges of a symmetric graph (COO triplets)
row=[]
col=[]
dat=[]

for i in range(nconf):
    d = md.rmsd(tr,tr,frame=i)
    # only landmarks with a higher index are candidates
    d[:i+1] = np.inf
    # candidates ordered from the nearest; the stable sort resolves ties by the lower index
    for j in np.argsort(d,kind='stable')[:neighs]:
        if d[j] < np.inf:
            # the edge is stored in both directions
            row += [i,j]
            col += [j,i]
            dat += [d[j],d[j]]

# store results in sparse matrix
dist = coo_matrix((dat,(row,col)),shape=(nconf,nconf)) 
scale(0)

# check sanity
print("conformations (original dimensions): ", nconf)
print("non-zero distances: ", dist.getnnz())

In [None]:
# isomap itself: compute shortest paths in the k-neighbours graph,
# and multi-dimensional scaling on the resulting all-to-all distances
scale(ncores)
# NOTE(review): if the neighbour graph is not connected, some path lengths are
# presumably infinite and MDS may fail -- confirm / increase `neighs` in that case
sp = shortest_path(dist,directed=False)
mds = MDS(n_components=dims,dissimilarity='precomputed',n_jobs=ncores)
emb = mds.fit_transform(sp)
scale(0)

In [None]:
# scatter plot of the low-dimensional embedding of the landmarks
plt.figure(figsize=(12,8))
plt.scatter(*emb.transpose(),marker='.')
plt.show()

In [None]:
# limits of the metadynamics grid: three times the extent of the embedding
### XXX: assumes negative min and positive max
embmin=np.min(emb,axis=0)*3.
embmax=np.max(emb,axis=0)*3.

In [None]:
# save collective variables (one row per landmark)
np.savetxt('colvar.txt',emb)

## 3 Prepare metadynamics

### 3.1 Train the neural net

Create artificial neural network and train it to produce the above isomap embedding from superposed heavy atom coordinates. The ANN is encoded in `plumed.dat` to be used by metadynamic run later.

Technically, this is done for both coordinates of the embedding independently, the resulting `plumed.dat` files are merged.

Uses [Anncolvar](https://github.com/spiwokv/anncolvar).

In [None]:
# allow starting here in a fresh session: reload the embedding saved earlier
# and recompute the grid limits the same way as above
try:
    emb
except NameError:
    emb=np.loadtxt('colvar.txt')
    embmin=np.min(emb,axis=0)*3.
    embmax=np.max(emb,axis=0)*3.

In [None]:
# run anncolvar twice, for each isomap coordinate independently
epochs = 500

# anncolvar defaults
nlayers = 1
layers = [32, 0, 0]
# actfun = ['sigmoid','linear','linear']
# XXX: nothing else supported
actfun = ['tanh']*3

optim = 'adam'
loss = 'mean_squared_error'
batch = 256

# box size derived from the radius of gyration of the first landmark
first = md.load('landmark1.pdb')
rg = md.compute_rg(first)

# XXX magic -- seems to be safe
shift = rg[0] * 5
box = shift * 2

# indices of the heavy atoms in the full (all-atom) input structure
one = md.load(pdbfile)
heavy_idx = one[0].top.select("element != H")

# place the first landmark into the box; the output goes to editconf.log
!{gmx} editconf -f landmark1.pdb -o landmark1-box.pdb -translate {shift} {shift} {shift} -box {box} {box} {box} -c >editconf.log 2>&1

# the landmark contains heavy atoms only, numbered consecutively; rewrite the atom
# serial numbers to the numbers of the same atoms in the full structure
with open('landmark1-box.pdb') as boxf, open('reference.pdb','w') as ref:
    lines = boxf.readlines()
    for l in lines:
        if l[:4] == 'ATOM':
            newi = heavy_idx[int(l[4:11])-1]+1
            ref.write('ATOM%7d' % newi)
            ref.write(l[11:])
        else:
            ref.write(l)

scale(ncores)
# XXX: too much stdout
# one network per embedding coordinate; stdout of each run goes to its own log file
# NOTE(review): the meaning of the positional arguments (.1,0,0 and the two empty
# strings) is defined by the anncolvar API -- check against its documentation
for col in [1,2]:
    with open("anncolvar-%d.log" % col,"w") as log, redirect_stdout(log):
        anncolvar.anncollectivevariable('landmarks.xtc','reference.pdb','colvar.txt',col,
                                    box,box,box,.1,0,0,
                                    nlayers,*layers,
                                    *actfun,
                                    optim,loss,epochs,batch,
                                    '','',fannfile='cv%d-plumed.dat' % col,
                                    plumedfile='old-cv%d-plumed.dat' % col)
            
scale(0)
!tail editconf.log anncolvar-[12].log

In [None]:
# merge plumed[12].dat from the previous cell

onlyone = ['WHOLEMOLECULES', 'FIT_TO_TEMPLATE']

# XXX entirely
with open('plumed-ann.dat','w') as fout:
    for col in [1,2]:
        with open('cv%d-plumed.dat' % col) as fin:
            for line in fin:
                '''not necessary, reference.pdb is already renumbered
                w = line.split()
                if w[1] == 'POSITION':
                    a = w[2].split('=')
                    line = w[0] + (' POSITION ATOM=%d ' % (heavy_idx[int(a[1])-1]+1)) + ' '.join(w[3:]) + '\n'
                '''
                shutup = False
                for o in onlyone:
                    if line[:len(o)] == o:
                        if col != 1:
                            shutup = True
                if shutup:
                    continue
                    
                if line[:5] == 'PRINT':
                    continue
                            
                if re.match('[pl][0-9_rxyz]',line) or line[:4] == 'ARG=':
                    line = re.sub('[pl][0-9_rxyz]','cv%d_\g<0>' % col,line)
                    fout.write(line)
                elif line == 'LABEL=ann\n':
                    fout.write('LABEL=ann_cv%d\n' % col)
                else:
                    fout.write(line)
                
  
    # XXX: hardcoded
    fout.write('PRINT ARG=ann_cv1.node-0,ann_cv2.node-0 STRIDE=100 FILE=COLVAR-ann\n')
    fout.write('METAD ARG=ann_cv1.node-0,ann_cv2.node-0 SIGMA=0.1,0.1 HEIGHT=1.0 FILE=HILLS-ann PACE=1000 BIASFACTOR=15 TEMP=300 LABEL=restraint')
    fout.write(' GRID_MIN=%f,%f GRID_MAX=%f,%f\n' % (*embmin,*embmax)) 
    
# XXX: hack plumed.dat, FIT_TO_TEMPLATE TYPE=OPTIMAL is broken
!sed '/^FIT_TO_TEMPLATE/s/TYPE=OPTIMAL/TYPE=SIMPLE/' plumed-ann.dat >plumed-ann.dat.$$ && mv plumed-ann.dat.$$ plumed-ann.dat

### 3.2 Prepare PCV with the same landmarks

In [None]:
cvs = np.loadtxt('colvar.txt')
i=0
start=True
with open('landmarks.pdb') as inp, open('landmarks-pcv.pdb','w') as out:
    for line in inp:
        if line[:5] == 'MODEL':
            if i > 0: 
                out.write('END\n')
                
            out.write("REMARK X=%f Y=%f\n" % tuple(cvs[i]))
            i += 1
        if line[:4] == 'ATOM':
            out.write(line)
    out.write('END\n')

           
!grep WHOLEMOLECULES plumed-ann.dat >plumed-pcv.dat
with open('plumed-pcv.dat','a') as plmd:
    plmd.write("p1: PROPERTYMAP REFERENCE=landmarks-pcv.pdb PROPERTY=X,Y LAMBDA=50.0 NEIGH_SIZE=50 NEIGH_STRIDE=50 EPSILON=0.01\n")
    plmd.write('METAD ARG=p1.X,p1.Y SIGMA=0.1,0.1 HEIGHT=1.0 FILE=HILLS-pcv PACE=1000 BIASFACTOR=15 TEMP=300 LABEL=restraint')
    plmd.write(' GRID_MIN=%f,%f GRID_MAX=%f,%f\n' % (*embmin,*embmax)) 
    plmd.write('PRINT ARG=p1.X,p1.Y,p1.zzz,restraint.bias STRIDE=100 FILE=COLVAR-pcv FMT=%8.4f\n')

## 4. Run MD

Run a quite standard molecular dynamics protocol, adapted from the [Lysozyme tutorial](http://www.mdtutorials.com/gmx/lysozyme/index.html), i.e. solvate, add counterions, minimize, equilibrate, and run production.

Preparation phases are common, then we run vanilla and metadynamic simulations to compare the results.

In [None]:
# length of the production runs in steps; replaces the value of mdsteps set
# with the defaults at the top of the notebook
# 2 fs steps, i.e. 1e8 steps = 200 ns
mdsteps = 500*1000*200

### 4.1 Prepare, minimize, and equilibrate

In [None]:
# elementary preparation

# XXX hardcoded defaults for the time being, replace with template eventually

# copy the parameter files from the notebook directory to the workdir
os.chdir(basedir)
!cp ions.mdp minim-sol.mdp {workdir}
os.chdir(workdir)

# generate topology, define the box (dodecahedron, mdbox distance to the wall),
# solvate, and add NA/CL counterions to neutralize the system
# NOTE(review): `-i 13` appears to feed the answer to the interactive group prompt
# of genion through the wrapper -- confirm the group number for other systems
!{gmx} pdb2gmx -f {pdbfile} -o {pdbid}.gro -water tip3p -ff amber94 -ignh -p {pdbid}.top && \
{gmx} editconf -f {pdbid}.gro -o {pdbid}-box.gro -c -d {mdbox} -bt dodecahedron && \
{gmx} solvate -cp {pdbid}-box.gro -cs spc216.gro -o {pdbid}-solv.gro -p {pdbid}.top && \
{gmx} grompp -f ions.mdp -c {pdbid}-solv.gro -p {pdbid}.top -o ions.tpr && \
{gmx} -i 13 genion -s ions.tpr -o {pdbid}-ions.gro -p {pdbid}.top -pname NA -nname CL -neutral

In [None]:
# minimize with steepest descent, then extract the energy term plotted below to em.xvg
# (ntmpi ranks with ntomp OpenMP threads each)
!{gmx} grompp -f minim-sol.mdp -c {pdbid}-ions.gro -p {pdbid}.top -o em.tpr &&\
unset OMP_NUM_THREADS && {gmx} -n {ntmpi} mdrun -v -deffnm em -ntomp {ntomp} -pin on &&\
{gmx} -i 10 energy -f em.edr -o em.xvg

In [None]:
# plot the course of the minimization
x,y=read_xvg(os.path.join(workdir,'em.xvg'))

plt.figure(figsize=(15,5))
plt.plot(x,y)
plt.grid()
plt.xlabel('step')
plt.ylabel('potential (kJ/mol)')
plt.title('Energy minimization')

plt.show()

In [None]:
# isothermal - isochoric equilibration
!cp {basedir}/nvt.mdp .

# start from the minimized structure; the term extracted to temp.xvg is plotted below
!{gmx} grompp -f nvt.mdp -c em.gro -r em.gro -p {pdbid}.top -o nvt.tpr && \
unset OMP_NUM_THREADS && {gmx} -n {ntmpi} mdrun -ntomp {ntomp}  -pin on -deffnm nvt && \
{gmx} -i 16 energy -f nvt.edr -o temp.xvg

In [None]:
# plot the temperature during the equilibration
x,y=read_xvg(os.path.join(workdir,'temp.xvg'))

plt.figure(figsize=(15,5))
plt.plot(x,y)
plt.grid()
plt.xlabel('time (ps)')
plt.ylabel('temperature (K)')
plt.title('isothermal-isochoric equilibration')
plt.show()

In [None]:
# isothermal - isobaric equilibration
!cp {basedir}/npt.mdp .

# continue from the NVT result and its checkpoint; the terms extracted to
# press.xvg and dens.xvg are plotted below
!{gmx} grompp -f npt.mdp -c nvt.gro -r nvt.gro -t nvt.cpt -p {pdbid}.top -o npt.tpr && \
unset OMP_NUM_THREADS && {gmx} -n {ntmpi} mdrun -ntomp {ntomp} -pin on -deffnm npt && \
{gmx} -i 18 energy -f npt.edr -o press.xvg && \
{gmx} -i 24 energy -f npt.edr -o dens.xvg

In [None]:
# plot pressure (top) and density (bottom) during the equilibration
xp,yp=read_xvg(os.path.join(workdir,'press.xvg'))
xd,yd=read_xvg(os.path.join(workdir,'dens.xvg'))

plt.figure(figsize=(15,8))
plt.subplot(211)
plt.plot(xp,yp)
plt.title('isothermal-isobaric equilibration')
plt.grid()
#plt.xlabel('time (ps)')
plt.ylabel("pressure (bar)")


plt.subplot(212)
plt.xlabel('time (ps)')
plt.ylabel('density (kg/m3)')
plt.grid()
plt.plot(xd,yd)
plt.show()

### 4.2 Run vanilla MD

In [None]:
# start from a fresh copy of the template and append the requested run length
!cp {basedir}/md.mdp.template md.mdp
with open('md.mdp','a') as mdp:
    mdp.write("nsteps = %d\n" % mdsteps)

# unbiased run, continued from the NPT equilibration
!{gmx} grompp -f md.mdp -c npt.gro -t npt.cpt -p {pdbid}.top -o md-vanilla.tpr && \
unset OMP_NUM_THREADS && {gmx} -n {ntmpi} mdrun -ntomp {ntomp} -pin on -deffnm md-vanilla

### 4.3 Run Anncolvar metadynamics

In [None]:
# start from a fresh copy of the template and append the requested run length
!cp {basedir}/md.mdp.template md.mdp

# mdsteps=1000000
with open('md.mdp','a') as mdp:
    mdp.write("nsteps = %d\n" % mdsteps)

!{gmx} grompp -f md.mdp -c npt.gro -t npt.cpt -p {pdbid}.top -o md-ann.tpr 

In [None]:
# metadynamics biased by the neural-network collective variables
!unset OMP_NUM_THREADS && {gmx} -n {ntmpi} mdrun -ntomp {ntomp} -pin on -deffnm md-ann -plumed plumed-ann.dat

### 4.4 Run PCV metadynamics

In [None]:
# md.mdp is not regenerated here, the file left in the workdir by a previous cell is used
!{gmx} grompp -f md.mdp -c npt.gro -t npt.cpt -p {pdbid}.top -o md-pcv.tpr

In [None]:
# metadynamics biased by the path collective variables
!unset OMP_NUM_THREADS && {gmx} -n {ntmpi} mdrun -ntomp {ntomp} -pin on -deffnm md-pcv -plumed plumed-pcv.dat

### 4.5 Extend trajectories arbitrarily

You can skip this section entirely, and go to 5. to examine results of the short trajectories if it is the purpose. But typically longer trajectories must be computed.

This is commented out now, we run computation in K8s on strong enough resources.


In [None]:
# # Backup short trajectories first
# for suffix in ['vanilla','ann','pcv']:
#     !cp md-{suffix}.tpr md-{suffix}-short.tpr
#     !cp md-{suffix}.xtc md-{suffix}-short.xtc
#     !cp md-{suffix}.edr md-{suffix}-short.edr
#     !cp COLVAR-{suffix} COLVAR-{suffix}-short
#     !cp HILLS-{suffix} HILLS-{suffix}-short

In [None]:
# wanted=100000000 # in steps

# !{gmx} convert-tpr -s md1.tpr -o md1-long.tpr -nsteps {wanted} && mv md1-long.tpr md1.tpr
# !{gmx} convert-tpr -s md2.tpr -o md2-long.tpr -nsteps {wanted} && mv md2-long.tpr md2.tpr
# !{gmx} convert-tpr -s md3.tpr -o md3-long.tpr -nsteps {wanted} && mv md3-long.tpr md3.tpr
# !echo RESTART >plumed-ann-restart.dat && cat plumed-ann.dat >>plumed-ann-restart.dat
# !echo RESTART >plumed-pcv-restart.dat && cat plumed-pcv.dat >>plumed-pcv-restart.dat

Transfer the workdir to a more powerful node, and re-run the MD computation to the desired trajectory length.

This involves grabbing files md[123].\*, plumed-restart.dat, COLVAR, HILLS, and reference.pdb from the working directory, and running commands in the following cells on the powerful node (with appropriate {ntmpi} and {ntomp} settings). The script qsub-extend.sh from the same repo can be used to submit to PBS (after editing to match your environment).

Alternatively, just uncomment the following cells and run them. It will take quite long time to finish. If the computation gets killed, just run it again, it restarts from a checkpoint.

In [None]:
# !unset OMP_NUM_THREADS && {gmx} -n {ntmpi} mdrun -ntomp {ntomp} -pin on -deffnm md1 -cpi md1.cpt

In [None]:
# !unset OMP_NUM_THREADS && {gmx} -n {ntmpi} mdrun -ntomp {ntomp} -pin on -deffnm md2 -cpi md2.cpt -plumed plumed-ann-restart

In [None]:
# !unset OMP_NUM_THREADS && {gmx} -n {ntmpi} mdrun -ntomp {ntomp} -pin on -deffnm md3 -cpi md3.cpt -plumed plumed-pcv-restart

# 5 Analyze results

In [None]:
def pbc_and_fit(base):
    xtc = base + ".xtc"
    pbc = base + "-pbc.xtc"
    !{gmx} -i 1 trjconv -f {xtc} -s npt.gro -pbc nojump -o {pbc} 2>&1 | tail -100
    tr = md.load_xtc(pbc,top=pdbid+'.gro')
    idx=tr[0].top.select("name CA")
    tr.superpose(tr[0],atom_indices=idx)
    return tr

def plot_rmsd_rgyr(tr):
    """Plot the RMSD to frame 0 (top) and the radius of gyration (bottom) of `tr`."""
    # (subplot position, values, y axis label) of the two panels
    panels = (
        (211, md.rmsd(tr,tr), 'RMSD wrt. frame 0 (nm)'),
        (212, md.compute_rg(tr), 'Radius of gyration (nm)'),
    )
    plt.figure(figsize=(15,8))
    for position, values, label in panels:
        plt.subplot(position)
        plt.plot(values)
        plt.grid()
        plt.ylabel(label)
    # the x label belongs to the bottom panel, which is the current one here
    plt.xlabel('time (10 ps steps)')
    plt.show()
    
def plot_stability(tr,part=.7):
    """Compare the eigenvalue spectra of the whole trajectory and of its beginning.

    For the heavy protein atoms, the flattened coordinates are centered on their
    average, the matrix X^T X / n_frames is built and its eigenvalues are
    normalized to unit sum. The cumulative sums (largest eigenvalue first) are
    plotted for the full trajectory and for its initial `part` fraction of
    frames, up to the point where the full curve reaches 0.95.
    """
    def normalized_spectrum(traj):
        # one row per frame, all coordinates of the frame in one row
        coords = np.reshape(traj.xyz,(traj.xyz.shape[0],traj.xyz.shape[1]*3))
        coords -= np.average(coords,axis=0)

        cov = np.matmul(np.transpose(coords),coords)
        cov /= coords.shape[0]

        eigvals = np.abs(np.sort(np.linalg.eigvalsh(cov)))
        eigvals /= np.sum(eigvals)
        return eigvals

    heavy = tr.atom_slice(tr.top.select("protein and element != H"),inplace=False)

    full = normalized_spectrum(heavy)
    num = len(full)
    full = np.cumsum(np.flip(full[-num:]))

    # the initial part of the trajectory, same number of components as above
    part_heavy = heavy[:int(len(heavy) * part)]
    start = normalized_spectrum(part_heavy)
    start = np.cumsum(np.flip(start[-num:]))

    # number of components needed by the full trajectory to reach 95 %
    show = np.count_nonzero(full < .95)

    plt.figure(figsize=((15,8)))
    for curve in (full, start):
        plt.plot(range(show),curve[:show])
    plt.grid()
    plt.legend(['full','initial %d%%' % int(part*100)])
    plt.ylabel("cummulative eigenvalues")
    plt.show()
    
    
def plot_energy(base):
    edr = base + ".edr"
#    !echo 11 | {gmx} energy -f {edr} 2>&1 | tail
    !{gmx} -i 11 energy -f {edr} 2>&1 | tail
    energ = read_xvg('energy.xvg')
    
    plt.figure(figsize=(15,8))
    plt.plot(*energ)
    plt.grid()
    plt.ylabel('Potential energy (kJ/mol)')
    plt.xlabel('time (ps)')
    plt.show()
    return energ
    
def plot_colvar(suffix,stride=20):
    """Plot the collective variables recorded in ``COLVAR-<suffix>`` over the landmarks.

    Every `stride`-th record of the file is used; columns 1 and 2 are drawn as
    points coloured by their order. The landmark embedding from colvar.txt is
    drawn on top as black crosses.
    """
    landmarks = np.loadtxt('colvar.txt').T
    records = np.loadtxt('COLVAR-' + suffix)[::stride].T
    # rows 1 and 2 after the transposition, i.e. columns 1 and 2 of the file
    xy = records[1:3]
    order = range(xy.shape[1])

    plt.figure(figsize=(12,12))
    plt.scatter(*xy,c=order,marker='.',cmap=plt.get_cmap('rainbow'))
    plt.colorbar()
    plt.scatter(*landmarks,c='black',marker='+')
    plt.show()
    

## 5.1 Unbiased MD

In [None]:
# base name of the files of the unbiased run
base1 = "md-vanilla"

In [None]:
# trajectory without periodic-boundary jumps, fitted to its first frame
tr1=pbc_and_fit(base1)

In [None]:
v = nv.show_mdtraj(tr1,gui=False)
v

In [None]:
plot_rmsd_rgyr(tr1)

In [None]:
# the energies are kept for the combined plot at the end
energ1=plot_energy(base1)

In [None]:
plot_stability(tr1)

## 5.2 Biased MD with Anncolvar

In [None]:
# base name of the files of the run biased by the neural-network CVs
base2 = "md-ann"
#base2 = "md-fann"

In [None]:
# trajectory without periodic-boundary jumps, fitted to its first frame
tr2 = pbc_and_fit(base2)

In [None]:
v = nv.show_mdtraj(tr2,gui=False)
v

In [None]:
plot_rmsd_rgyr(tr2)

In [None]:
# the energies are kept for the combined plot at the end
energ2 = plot_energy(base2)

In [None]:
plot_stability(tr2)

In [None]:
# collective variables visited by the run (read from COLVAR-ann) over the landmarks
plot_colvar('ann')

## 5.3 Biased MD with PCV

In [None]:
# base name of the files of the run biased by the path CVs
base3='md-pcv'
# trajectory without periodic-boundary jumps, fitted to its first frame
tr3 = pbc_and_fit(base3)

In [None]:
v = nv.show_mdtraj(tr3,gui=False)
v

In [None]:
plot_rmsd_rgyr(tr3)

In [None]:
# the energies are kept for the combined plot at the end
energ3=plot_energy(base3)

In [None]:
plot_stability(tr3)

In [None]:
# collective variables visited by the run (read from COLVAR-pcv) over the landmarks
plot_colvar('pcv')

## 5.4 Progress all together

In [None]:
# RMSD to the first frame and radius of gyration of all three trajectories
rmsd1 = md.rmsd(tr1,tr1)
rg1 = md.compute_rg(tr1)
rmsd2 = md.rmsd(tr2,tr2)
rg2 = md.compute_rg(tr2)
rmsd3 = md.rmsd(tr3,tr3)
rg3 = md.compute_rg(tr3)

# XXX: same length expected
l = len(rmsd1)
# eight ticks along the trajectory; the step is kept at 1 or more because
# np.arange() fails on a zero step (trajectories shorter than 8 frames)
l8 = max(1, l // 8)
ticks = np.arange(0,l,l8)
# tick labels are the frame numbers divided by 100
labels = ticks / 100

plt.rcParams.update({'font.size': 14})
_,ax = plt.subplots(3,1,figsize=(15,8))

# one panel per quantity, curves in the order unbiased, ANN, PCV
panels = (
    ('RMSD (nm)', (rmsd1, rmsd2, rmsd3)),
    ('R. gyr. (nm)', (rg1, rg2, rg3)),
    ('Epot (kJ/mol)', (energ1[1], energ2[1], energ3[1])),
)
for axis, (ylabel, series) in zip(ax, panels):
    for values in series:
        axis.plot(values)
    axis.grid()
    axis.set_ylabel(ylabel)
    axis.set_xticks(ticks)
    axis.set_xticklabels(labels)

ax[0].legend(['unbiased','ANN','PCV'])
ax[2].set_xlabel('time (ns)')
plt.savefig('graphs.pdf')
plt.show()

## 5.5 Crosscheck of both CV calculations

Use plumed driver to calculate CVs with PCV CV definition on ANN trajectory and vice versa.
ANN tends to explore wider regions, beyond the space covered by landmarks (should be visible on the maps above). 
The reason is that PCV approach zero when farther from any landmark. 
This should be visible as concentrating these segments of ANN trajectory (color) around (1,1) when evaluated on PCV.

On the contrary, PCV trajectories should look similar on ANN CVs.

In [None]:
# plumed is run through the same wrapper script as gromacs
plumed=f"{gmx} -p plumed"

# driver input: the PCV definition only, no bias; values are printed for every frame
!grep WHOLEMOLECULES plumed-ann.dat >plumed-pcv-driver.dat
with open('plumed-pcv-driver.dat','a') as plmd:
    plmd.write("p1: PROPERTYMAP REFERENCE=landmarks-pcv.pdb PROPERTY=X,Y LAMBDA=50.0 NEIGH_SIZE=50 NEIGH_STRIDE=1\n")
    plmd.write('PRINT ARG=p1.X,p1.Y,p1.zzz STRIDE=1 FILE=COLVAR-pcv-driver FMT=%8.4f\n')

# md.mdp: dt = 2fs, nstxout = 5000 => one frame per 10 ps

# evaluate the PCVs on the trajectory of the ANN-biased run
!{plumed} driver --mf_xtc {base2}.xtc --plumed plumed-pcv-driver.dat --timestep 10 --trajectory-stride 1

In [None]:
# COLVAR-ann: PRINT STRIDE=100 ~ 200 fs  => 50x finer than .xtc
# take every 50th record to match the trajectory frames; columns 1 and 2 hold the two CVs
stride = 50
pureann=np.loadtxt('COLVAR-ann')[::stride].T[1:3]

# the driver output has one extra record at the beginning, skip it
pcvonann=np.loadtxt('COLVAR-pcv-driver')[1:].T[1:3]

In [None]:
def plot_cv2(first,second,stride=1,start=0,stop=-1):
    """Compare two series of two-dimensional collective variables side by side.

    Parameters
    ----------
    first, second : arrays with two rows (one per CV component), one column per sample
    stride : only every `stride`-th sample is used
    start, stop : window of (strided) samples to show; stop=-1 means up to the end

    The top row shows scatter plots of both series with common axis limits,
    coloured by the sample number. The bottom row shows component 0 (left) and
    component 1 (right) of both series over the sample number.
    """
    # common axis limits, taken from the complete data with a 10 % margin
    both=np.concatenate((first,second),axis=1)
    xymin = np.min(both,axis=1) * 1.1
    xymax = np.max(both,axis=1) * 1.1
    first = first[:,::stride]
    second = second[:,::stride]

    if stop == -1:
        stop = first.shape[1]

    # the colour scale spans all strided samples, so a window keeps the colours
    # it has in the plot of the whole series
    norm = plt.Normalize(0,first.shape[1])
    cmap = plt.get_cmap('rainbow')
    first = first[:,start:stop]
    second = second[:,start:stop]
    cb = range(start,stop)

    plt.figure(figsize=(14,12))
    plt.subplot(221)
    plt.xlim((xymin[0],xymax[0]))
    plt.ylim((xymin[1],xymax[1]))
    plt.scatter(*first,c=cb,marker='.',cmap=cmap,norm=norm)
    plt.colorbar(cmap=cmap,norm=norm)

    plt.subplot(222)
    plt.xlim((xymin[0],xymax[0]))
    plt.ylim((xymin[1],xymax[1]))
    plt.scatter(*second,c=cb,marker='.',cmap=cmap,norm=norm)
    plt.colorbar(cmap=cmap,norm=norm)

    plt.subplot(223)
    plt.plot(range(start,stop),first[0],label='first')
    plt.plot(range(start,stop),second[0],label='second')
    plt.legend()

    plt.subplot(224)
    # fix: component 1 of both series (component 0 of `first` used to be drawn here)
    plt.plot(range(start,stop),first[1],label='first')
    plt.plot(range(start,stop),second[1],label='second')
    plt.legend()

    plt.show()

In [None]:
# whole trajectory first
# (stride=50 is applied on top of the thinning done when the series were loaded)
plot_cv2(pureann,pcvonann,stride=50,start=0,stop=-1)

In [None]:
# segments out of landmarks
# NOTE(review): the window 220-320 presumably fits one particular run -- adjust for other trajectories
plot_cv2(pureann,pcvonann,stride=50,start=220,stop=320)
# plot_cv2(pureann,pcvonann,stride=50,start=100,stop=150)

In [None]:
# TODO
# comparison with the literature (other CVs)
# join the trajectories, make various projections -- essential coordinates as well as isomap, and look at how the individual trajectories appear in them