# Mini project I

Content:
- I/O
- Featurisation
- Dimension reduction
- MSM estimation
- MSM validation

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import mdshare
import pyemma
import deeptime as dt
from timescales import implied_timescales_msm

In [None]:
# Fetch the topology (.pdb) and all matching trajectory (.xtc) files into the
# local `data` working directory.
data_dir = 'data'
pdb = mdshare.fetch('pentapeptide-impl-solv.pdb', working_directory=data_dir)
files = mdshare.fetch(
    'pentapeptide-*-500ns-impl-solv.xtc',
    working_directory=data_dir,
)

## I/O, featurisation

Load two different molecular features:
- backbone torsions (with `cossin=True` and `periodic=False`)
- backbone atom positions (Cartesian coordinates of the atoms returned by `select_Backbone()`; `add_selection` takes no `periodic` argument)

In [None]:
# Feature set 1: backbone torsions, requested with cossin=True and
# periodic=False.
torsions_feat = pyemma.coordinates.featurizer(pdb)
torsions_feat.add_backbone_torsions(periodic=False, cossin=True)
torsions_data = pyemma.coordinates.load(files, features=torsions_feat)

# Feature set 2: coordinates of the atoms picked by select_Backbone().
positions_feat = pyemma.coordinates.featurizer(pdb)
backbone_atoms = positions_feat.select_Backbone()
positions_feat.add_selection(backbone_atoms)
positions_data = pyemma.coordinates.load(files, features=positions_feat)

Compute VAMP scores for each molecular feature at a lag time of five steps and a dimensionality of four:

In [None]:
# Lag time (in trajectory steps) used both by the TICA estimator and by the
# cross-validated VAMP scoring. The scoring previously used lagtime=500, which
# contradicted the task ("a lag time of five steps") and the estimator itself.
vamp_lagtime = 5
tica_estimator = dt.decomposition.TICA(lagtime=vamp_lagtime, dim=4)

# Cross-validated VAMP scores for the torsion features.
torsions_scores = dt.decomposition.vamp_score_cv(
    tica_estimator, torsions_data, lagtime=vamp_lagtime)
print(f'Torsions:  {np.mean(torsions_scores):.2f}±{np.std(torsions_scores):.2f}')

# Same scoring for the backbone position features, so both are comparable.
positions_scores = dt.decomposition.vamp_score_cv(
    tica_estimator, positions_data, lagtime=vamp_lagtime)
print(f'Positions: {np.mean(positions_scores):.2f}±{np.std(positions_scores):.2f}')

Perform a dimension reduction of the torsion data with TICA at a lag time of five steps and four independent components:

In [None]:
# Fit TICA on the torsion features and project each trajectory separately.
tica = tica_estimator.fit(torsions_data).fetch_model()
tica_output = [tica.transform(traj) for traj in torsions_data]

# All projected frames stacked into one array, used for plotting only.
tica_concatenated = np.concatenate(tica_output)

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
hist_ax, density_ax = axes

# Left panel: per-component histograms of the four independent components.
pyemma.plots.plot_feature_histograms(
    tica_concatenated,
    feature_labels=['IC1', 'IC2', 'IC3', 'IC4'],
    ax=hist_ax,
    ylog=True,
)

# Right panel: density in the plane of the first two components.
pyemma.plots.plot_density(
    tica_concatenated[:, 0],
    tica_concatenated[:, 1],
    ax=density_ax,
    logscale=True,
)
density_ax.set_xlabel('IC 1')
density_ax.set_ylabel('IC 2')
fig.tight_layout()

Discretise with 75 $k$-means centers and a stride of 10:

In [None]:
# Fit k-means (75 centers) on every 10th frame of the concatenated TICA
# projection, as the task asks for a stride of 10; previously every frame was
# used for fitting. The seed is fixed so the cluster centers are reproducible.
tica_frames = np.concatenate(tica_output)
kmeans_estimator = dt.clustering.KMeans(75, max_iter=50, fixed_seed=1)
cluster = kmeans_estimator.fit(tica_frames[::10]).fetch_model()

# The stride applies to fitting only: every frame of every trajectory is
# assigned to a cluster center to obtain the discrete trajectories.
dtrajs = [cluster.transform(x) for x in tica_output]

# Cluster centers drawn on top of the free energy surface spanned by the
# first two independent components.
fig, ax = plt.subplots(figsize=(4, 4))
pyemma.plots.plot_free_energy(
    *tica_frames[:, :2].T, ax=ax, cbar=False, legacy=False)
ax.scatter(*cluster.cluster_centers[:, :2].T, s=15, c='k')
ax.set_xlabel('IC 1')
ax.set_ylabel('IC 2')
fig.tight_layout()

Plot the first 10 implied timescales with error bars for lag times up to 50 steps (the trajectory spacing is $0.1\frac{\mathrm{ns}}{\mathrm{step}}$):

In [None]:
# `implied_timescales_msm` is imported once in the notebook's import cell, so
# the duplicate import that used to sit here has been dropped.

# Lag times 1..50 (in steps); `dt=0.1` with `units='ns'` matches the stated
# trajectory spacing of 0.1 ns per step.
lagtimes = np.arange(1, 51)
its = implied_timescales_msm(dtrajs, lagtimes=lagtimes, nits=10)
pyemma.plots.plot_implied_timescales(its, units='ns', dt=0.1);

Estimate a Bayesian MSM at a lag time of five steps and show a CK test for five metastable states:

In [None]:
# Count transitions at a lag time of 5 steps using the 'effective' count mode,
# then restrict the count model via `submodel_largest()`.
count_estimator = dt.markov.TransitionCountEstimator(5, 'effective')
count_model = count_estimator.fit(dtrajs).fetch_model()
counts = count_model.submodel_largest()

# `msm` holds whatever `fit` returns; the validator is requested from it below.
bayesian_estimator = dt.markov.msm.BayesianMSM()
msm = bayesian_estimator.fit(counts)

# Chapman-Kolmogorov test for five metastable states.
n_metastable = 5
validator = msm.chapman_kolmogorov_validator(n_metastable, mlags=6)
cktest = validator.fit(dtrajs).fetch_model()
pyemma.plots.plot_cktest(cktest, units='ns', dt=0.1);