# Mini project I

Content:
- I/O
- Featurisation
- Dimension reduction
- MSM estimation
- MSM validation

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import mdshare
import pyemma


def score_cv(data, dim, lag, number_of_splits=10, validation_fraction=0.5):
    """Compute a cross-validated VAMP2 score.

    We randomly split the list of independent trajectories into
    a training and a validation set, compute the VAMP2 score,
    and repeat this process several times.

    Parameters
    ----------
    data : list of numpy.ndarrays
        The input data; one array per independent trajectory.
    dim : int
        Number of processes to score; equivalent to the dimension
        after projecting the data with VAMP2.
    lag : int
        Lag time for the VAMP2 scoring.
    number_of_splits : int, optional, default=10
        How often do we repeat the splitting and score calculation.
    validation_fraction : float, optional, default=0.5
        Fraction of trajectories which should go into the validation
        set during a split.

    Returns
    -------
    scores : numpy.ndarray of shape (number_of_splits,)
        One validation score per random split.

    Raises
    ------
    ValueError
        If the requested split would leave the training set or the
        validation set without any trajectory.
    """
    ntraj = len(data)
    nval = int(ntraj * validation_fraction)
    # Both subsets must be non-empty: the model is fitted on the training
    # trajectories and scored on the validation trajectories.
    if not 0 < nval < ntraj:
        raise ValueError(
            f'validation_fraction={validation_fraction} assigns {nval} of '
            f'{ntraj} trajectories to the validation set; both the training '
            'and the validation set need at least one trajectory')
    # we temporarily suppress very short-lived progress bars
    from pyemma.util.contexts import settings
    with settings(show_progress_bars=False):
        scores = np.zeros(number_of_splits)
        for n in range(number_of_splits):
            # draw the validation indices without replacement; a set of
            # plain ints gives cheap membership tests below
            ival = set(
                np.random.choice(ntraj, size=nval, replace=False).tolist())
            training = [d for i, d in enumerate(data) if i not in ival]
            validation = [d for i, d in enumerate(data) if i in ival]
            vamp = pyemma.coordinates.vamp(training, lag=lag, dim=dim)
            scores[n] = vamp.score(validation)
    return scores


# Fetch the pentapeptide topology (PDB) and every trajectory file matching
# the glob pattern; both calls use the local 'data' working directory.
pdb = mdshare.fetch(
    'pentapeptide-impl-solv.pdb',
    working_directory='data',
)
files = mdshare.fetch(
    'pentapeptide-*-500ns-impl-solv.xtc',
    working_directory='data',
)

## I/O, featurisation

Load two different molecular features:
- backbone torsions (with `cossin=True` and `periodic=False`)
- backbone atom positions (with `periodic=False`)

In [None]:
# One featurizer per molecular feature, both built from the same topology.
torsions_feat = pyemma.coordinates.featurizer(pdb)
positions_feat = pyemma.coordinates.featurizer(pdb)

# Backbone torsions as cos/sin pairs; positions of the backbone selection.
torsions_feat.add_backbone_torsions(periodic=False, cossin=True)
positions_feat.add_selection(positions_feat.select_Backbone())

# Load the trajectory files once per featurizer.
torsions_data = pyemma.coordinates.load(files, features=torsions_feat)
positions_data = pyemma.coordinates.load(files, features=positions_feat)

Compute cross-validated VAMP2 scores (using `score_cv`) for each molecular feature at a lag time of five steps and a dimensionality of four:

In [None]:
# score_cv takes (data, dim, lag) positionally; keyword arguments make sure
# the lag time (five steps) and the dimensionality (four) are not swapped.
torsions_scores = score_cv(torsions_data, dim=4, lag=5)
print(f'Torsions:  {np.mean(torsions_scores):.2f}±{np.std(torsions_scores):.2f}')

positions_scores = score_cv(positions_data, dim=4, lag=5)
print(f'Positions: {np.mean(positions_scores):.2f}±{np.std(positions_scores):.2f}')

Perform a dimension reduction of the torsion data with TICA at a lag time of five steps and four independent components:

In [None]:
# TICA projection of the torsion features; keep the per-trajectory output
# and a single concatenated array for plotting.
tica = pyemma.coordinates.tica(torsions_data, dim=4, lag=5)
tica_output = tica.get_output()
tica_concatenated = np.concatenate(tica_output)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# Left panel: histogram of each independent component (log-scaled).
pyemma.plots.plot_feature_histograms(
    tica_concatenated,
    feature_labels=[f'IC{i}' for i in range(1, 5)],
    ylog=True,
    ax=axes[0])

# Right panel: sample density in the plane of the first two components.
pyemma.plots.plot_density(
    tica_concatenated[:, 0],
    tica_concatenated[:, 1],
    ax=axes[1],
    logscale=True)
axes[1].set_xlabel('IC 1')
axes[1].set_ylabel('IC 2')

fig.tight_layout()

Discretise with 75 $k$-means centers and a stride of 10:

In [None]:
# k-means discretisation of the TICA output (fixed seed).
cluster = pyemma.coordinates.cluster_kmeans(
    tica_output,
    k=75,
    max_iter=50,
    stride=10,
    fixed_seed=1,
)
dtrajs_concatenated = np.concatenate(cluster.dtrajs)

# Free energy surface over the first two components with the cluster
# centers drawn on top as black dots.
fig, ax = plt.subplots(figsize=(4, 4))
pyemma.plots.plot_free_energy(
    tica_concatenated[:, 0],
    tica_concatenated[:, 1],
    ax=ax,
    cbar=False,
    legacy=False)
ax.scatter(
    cluster.clustercenters[:, 0],
    cluster.clustercenters[:, 1],
    s=15,
    c='k')
ax.set_xlabel('IC 1')
ax.set_ylabel('IC 2')
fig.tight_layout()

Plot the first 10 implied timescales with errorbars up to a lag time of 50 steps (trajectory spacing is $0.1\frac{\mathrm{ns}}{\mathrm{step}}$):

In [None]:
# Implied timescales from the discrete trajectories, with Bayesian error
# estimates; the plot converts steps to ns via dt=0.1.
its = pyemma.msm.its(
    cluster.dtrajs,
    lags=50,
    nits=10,
    errors='bayes',
)
# trailing semicolon kept as in the notebook cell
pyemma.plots.plot_implied_timescales(its, dt=0.1, units='ns');

Estimate a Bayesian MSM at a lag time of five steps and show a CK test for five metastable states:

In [None]:
# Bayesian MSM on the discrete trajectories at a lag time of five steps.
msm = pyemma.msm.bayesian_markov_model(
    cluster.dtrajs,
    lag=5,
    dt_traj='0.1 ns',
)

# CK test for five metastable states; trailing semicolon kept as in the
# notebook cell.
pyemma.plots.plot_cktest(
    msm.cktest(5, mlags=6),
    dt=0.1,
    units='ns');