# 02 - Dimension reduction and discretisation

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import mdshare
import pyemma

In [None]:
# Fetch the double-well example archive into ./data and pull out the
# 'trajectory' array.
file = mdshare.fetch(
    'hmm-doublewell-2d-100k.npz',
    working_directory='data')
with np.load(file) as archive:
    data = archive['trajectory']

# Left panel: per-feature histograms. Right panel: scatter of the samples.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
hist_ax, scatter_ax = axes
pyemma.plots.plot_feature_histograms(
    data,
    feature_labels=['$x$', '$y$'],
    ax=hist_ax)
scatter_ax.scatter(data[:, 0], data[:, 1], s=1, alpha=0.3)
scatter_ax.set(xlabel='$x$', ylabel='$y$')
fig.tight_layout()

In [None]:
# k-means discretisation of `data` with k=200 cluster centers.
cluster_kmeans = pyemma.coordinates.cluster_kmeans(
    data,
    k=200,
)

In [None]:
# Regspace discretisation of `data` with dmin=0.3.
cluster_regspace = pyemma.coordinates.cluster_regspace(
    data,
    dmin=0.3,
)

In [None]:
# Overlay the cluster centers of each discretisation on the raw samples,
# one panel per clustering.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
clusterings = (cluster_kmeans, cluster_regspace)
for panel, clustering in zip(axes.flat, clusterings):
    centers = clustering.clustercenters
    panel.scatter(data[:, 0], data[:, 1], s=1, alpha=0.3)
    panel.scatter(centers[:, 0], centers[:, 1], s=15)
    panel.set(xlabel='$x$', ylabel='$y$')
fig.tight_layout()

In [None]:
# Show the discrete trajectories of both clusterings, k-means first.
for clustering in (cluster_kmeans, cluster_regspace):
    print(clustering.dtrajs)

In [None]:
# PCA of `data` reduced to one dimension (dim=1); print the transformed output.
pca = pyemma.coordinates.pca(
    data,
    dim=1,
)
pca_out = pca.get_output()
print(pca_out)

In [None]:
# TICA of `data` at lag=1 reduced to one dimension (dim=1); print the
# transformed output.
tica = pyemma.coordinates.tica(
    data,
    dim=1,
    lag=1,
)
tica_out = tica.get_output()
print(tica_out)

In [None]:
# Compare the 1D PCA and TICA projections.
# Left panel: histograms of the two projected coordinates.
# Right panel: the raw samples in grey with one line per method, built from the
# first column of that method's `eigenvectors`.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
pyemma.plots.plot_feature_histograms(
    np.concatenate([pca_out[0], tica_out[0]], axis=1),
    feature_labels=['PCA', 'TICA'],
    ax=axes[0])
axes[1].scatter(*data.T, s=0.1, alpha=0.3, c='grey')
for estimator, name in [(pca, 'PCA'), (tica, 'TICA')]:
    # Segment from the origin to 3x the absolute x/y components of the first
    # eigenvector column.
    axes[1].plot(
        [0, 3 * abs(estimator.eigenvectors[0, 0])],
        [0, 3 * abs(estimator.eigenvectors[1, 0])],
        linewidth=3,
        label=name)
axes[1].set_xlabel('$x$')
axes[1].set_ylabel('$y$')
# The labels passed to plot() are only shown once legend() is called; without
# it the two lines cannot be told apart.
axes[1].legend()
fig.tight_layout()

In [None]:
# Plot the first 300 entries of the first PCA and TICA output trajectories
# on a shared axis.
fig, ax = plt.subplots(figsize=(10, 3))
for projection, name in ((pca_out, 'PCA'), (tica_out, 'TICA')):
    first_traj = projection[0]
    ax.plot(first_traj[:300], label=name)
ax.set(xlabel='time / steps', ylabel='feature values')
ax.legend()
fig.tight_layout()

In [None]:
# 50 evenly spaced centers spanning the range of the PCA output, passed as a
# column vector to assign_to_centers.
centers_pca = np.linspace(np.min(pca_out), np.max(pca_out), num=50)
dtrajs_pca = pyemma.coordinates.assign_to_centers(
    pca_out,
    centers=centers_pca[:, np.newaxis])
print(dtrajs_pca)

In [None]:
# 50 evenly spaced centers spanning the range of the TICA output, passed as a
# column vector to assign_to_centers.
centers_tica = np.linspace(np.min(tica_out), np.max(tica_out), num=50)
dtrajs_tica = pyemma.coordinates.assign_to_centers(
    tica_out,
    centers=centers_tica[:, np.newaxis])
print(dtrajs_tica)

In [None]:
# Stack the PCA- and TICA-based discrete trajectories and transpose so that
# each method becomes one histogram column.
disc_states = np.concatenate([dtrajs_pca, dtrajs_tica]).T
pyemma.plots.plot_feature_histograms(
    disc_states,
    feature_labels=['PCA disc. state', 'TICA disc. state'],
)

In [None]:
# Fetch the alanine dipeptide topology and trajectory files into ./data.
pdb = mdshare.fetch('alanine-dipeptide-nowater.pdb', working_directory='data')
files = mdshare.fetch('alanine-dipeptide-*-250ns-nowater.dcd', working_directory='data')

# Featurise with the backbone torsions and load the fetched trajectories.
feat = pyemma.coordinates.featurizer(pdb)
feat.add_backbone_torsions()
data = pyemma.coordinates.load(files, features=feat)

# Concatenate once and reuse the result for both panels.
data_concatenated = np.concatenate(data)

# Raw strings keep the LaTeX backslashes literal; '\P' in a regular string is
# an invalid escape sequence and triggers a warning on current Python versions.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
pyemma.plots.plot_feature_histograms(
    data_concatenated, feature_labels=[r'$\Phi$', r'$\Psi$'], ax=axes[0])
axes[1].scatter(*data_concatenated.T, s=1, alpha=0.3)
axes[1].set_xlabel(r'$\Phi$')
axes[1].set_ylabel(r'$\Psi$')
fig.tight_layout()

In [None]:
# Discretise the backbone-torsion data with k-means (k=200) and regspace
# (dmin=0.3), then overlay each set of cluster centers on the samples.
cluster_kmeans = pyemma.coordinates.cluster_kmeans(data, k=200)
cluster_regspace = pyemma.coordinates.cluster_regspace(data, dmin=0.3)

# Concatenate once instead of once per panel.
samples = np.concatenate(data)

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for ax, cls in zip(axes.flat, [cluster_kmeans, cluster_regspace]):
    ax.scatter(*samples.T, s=1, alpha=0.3)
    ax.scatter(*cls.clustercenters.T, s=15)
    # The plotted features are the backbone torsions, not the x/y coordinates
    # of the earlier double-well example.
    ax.set_xlabel(r'$\Phi$')
    ax.set_ylabel(r'$\Psi$')
fig.tight_layout()

In [None]:
# Rebuild the featurizer around the atoms returned by select_Heavy() and
# reload the trajectories with it.
feat = pyemma.coordinates.featurizer(pdb)
heavy_atoms = feat.select_Heavy()
feat.add_selection(heavy_atoms)
data = pyemma.coordinates.load(files, features=feat)

# One histogram per feature, labelled with the featurizer's own descriptions.
fig, ax = plt.subplots(figsize=(10, 7))
pyemma.plots.plot_feature_histograms(
    np.concatenate(data),
    feature_labels=feat.describe(),
    ax=ax)
fig.tight_layout()

In [None]:
# PCA without an explicit `dim`; the per-trajectory outputs are concatenated.
pca = pyemma.coordinates.pca(data)
pca_all = np.concatenate(pca.get_output())
pc_labels = ['PC %d' % (idx + 1) for idx in range(pca.dimension())]
pc1, pc2 = pca_all[:, 0], pca_all[:, 1]

# Histograms of every component, then scatter and free energy of PC 1 vs PC 2.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
pyemma.plots.plot_feature_histograms(pca_all, pc_labels, ax=axes[0])
axes[1].scatter(pc1, pc2, s=1, alpha=0.3)
pyemma.plots.plot_free_energy(pc1, pc2, ax=axes[2])
for ax in axes.flat[1:]:
    ax.set(xlabel='PC 1', ylabel='PC 2')
fig.tight_layout()

In [None]:
# TICA at lag=1 without an explicit `dim`; the per-trajectory outputs are
# concatenated.
tica = pyemma.coordinates.tica(data, lag=1)
tica_all = np.concatenate(tica.get_output())
ic_labels = ['IC %d' % (idx + 1) for idx in range(tica.dimension())]
ic1, ic2 = tica_all[:, 0], tica_all[:, 1]

# Histograms of every component, then scatter and free energy of IC 1 vs IC 2.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
pyemma.plots.plot_feature_histograms(tica_all, ic_labels, ax=axes[0])
axes[1].scatter(ic1, ic2, s=1, alpha=0.3)
pyemma.plots.plot_free_energy(ic1, ic2, ax=axes[2])
for ax in axes.flat[1:]:
    ax.set(xlabel='IC 1', ylabel='IC 2')
fig.tight_layout()

In [None]:
# PCA restricted to two dimensions, followed by k-means (k=200) run on the
# PCA object itself.
pca = pyemma.coordinates.pca(data, dim=2)
pca_all = np.concatenate(pca.get_output())

cluster = pyemma.coordinates.cluster_kmeans(pca, k=200)
centers = cluster.clustercenters
pc_labels = ['PC %d' % (idx + 1) for idx in range(pca.dimension())]

# Left: component histograms. Right: samples with the cluster centers on top.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
hist_ax, scatter_ax = axes
pyemma.plots.plot_feature_histograms(pca_all, pc_labels, ax=hist_ax)
scatter_ax.scatter(pca_all[:, 0], pca_all[:, 1], s=1, alpha=0.3)
scatter_ax.scatter(centers[:, 0], centers[:, 1], s=15)
scatter_ax.set(xlabel='PC 1', ylabel='PC 2')
fig.tight_layout()

In [None]:
# TICA at lag=1 restricted to two dimensions, followed by k-means (k=200) run
# on the TICA object itself.
tica = pyemma.coordinates.tica(data, lag=1, dim=2)
tica_all = np.concatenate(tica.get_output())

cluster = pyemma.coordinates.cluster_kmeans(tica, k=200)
centers = cluster.clustercenters
ic_labels = ['IC %d' % (idx + 1) for idx in range(tica.dimension())]

# Left: component histograms. Right: samples with the cluster centers on top.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
hist_ax, scatter_ax = axes
pyemma.plots.plot_feature_histograms(tica_all, ic_labels, ax=hist_ax)
scatter_ax.scatter(tica_all[:, 0], tica_all[:, 1], s=1, alpha=0.3)
scatter_ax.scatter(centers[:, 0], centers[:, 1], s=15)
scatter_ax.set(xlabel='IC 1', ylabel='IC 2')
fig.tight_layout()

In [None]:
# Fetch the pentapeptide topology and trajectory files into ./data.
pdb = mdshare.fetch(
    'pentapeptide-impl-solv.pdb',
    working_directory='data')
files = mdshare.fetch(
    'pentapeptide-*-500ns-impl-solv.xtc',
    working_directory='data')

# Backbone torsions plus chi1 sidechain torsions, both added with cossin=True.
feat = pyemma.coordinates.featurizer(pdb)
feat.add_backbone_torsions(cossin=True)
feat.add_sidechain_torsions(which='chi1', cossin=True)

data = pyemma.coordinates.load(files, features=feat)

# One histogram per feature, labelled with the featurizer's own descriptions.
fig, ax = plt.subplots(figsize=(10, 7))
pyemma.plots.plot_feature_histograms(
    np.concatenate(data),
    feature_labels=feat.describe(),
    ax=ax)
fig.tight_layout()

In [None]:
# PCA of the pentapeptide features without an explicit `dim`; the
# per-trajectory outputs are concatenated.
pca = pyemma.coordinates.pca(data)
pca_all = np.concatenate(pca.get_output())
pc_labels = ['PC %d' % (idx + 1) for idx in range(pca.dimension())]
pc1, pc2 = pca_all[:, 0], pca_all[:, 1]

# Histograms of every component, then scatter and free energy of PC 1 vs PC 2.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
pyemma.plots.plot_feature_histograms(pca_all, pc_labels, ax=axes[0])
axes[1].scatter(pc1, pc2, s=1, alpha=0.3)
pyemma.plots.plot_free_energy(pc1, pc2, ax=axes[2])
for ax in axes.flat[1:]:
    ax.set(xlabel='PC 1', ylabel='PC 2')
fig.tight_layout()

In [None]:
# Repeat the TICA projection for several lag values, one figure row per lag:
# component histograms, IC 1 vs IC 2 scatter, and IC 1 vs IC 2 free energy.
lags = [1, 2, 5, 10, 20, 50]

fig, axes = plt.subplots(len(lags), 3, figsize=(10, len(lags) * 3))
for i, lag in enumerate(lags):
    tica = pyemma.coordinates.tica(data, lag=lag)
    tica_all = np.concatenate(tica.get_output())
    # Build the labels with their own index name so the comprehension does not
    # shadow the row index `i` used for the axes below.
    ic_labels = ['IC %d' % (k + 1) for k in range(tica.dimension())]
    pyemma.plots.plot_feature_histograms(tica_all, ic_labels, ax=axes[i, 0])
    axes[i, 1].scatter(*tica_all[:, :2].T, s=1, alpha=0.3)
    # Tag each row with its lag value; otherwise the rows are indistinguishable.
    axes[i, 1].set_title('lag = %d' % lag)
    pyemma.plots.plot_free_energy(*tica_all[:, :2].T, ax=axes[i, 2], cbar=False)
for ax in axes[:, 1:].flat:
    ax.set_xlabel('IC 1')
    ax.set_ylabel('IC 2')
fig.tight_layout()

In [None]:
# PCA restricted to three dimensions; the output used for plotting and the
# k-means run (k=200) both use stride=5.
pca = pyemma.coordinates.pca(data, dim=3)
pca_all = np.concatenate(pca.get_output(stride=5))

cluster = pyemma.coordinates.cluster_kmeans(pca, k=200, stride=5)
centers = cluster.clustercenters
pc_labels = ['PC %d' % (idx + 1) for idx in range(pca.dimension())]

# First panel: component histograms. Remaining panels: one per pair of
# components, with the cluster centers drawn over the samples.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
pyemma.plots.plot_feature_histograms(pca_all, pc_labels, ax=axes[0, 0])
component_pairs = ((0, 1), (0, 2), (1, 2))
for ax, (first, second) in zip(axes.flat[1:], component_pairs):
    ax.scatter(pca_all[:, first], pca_all[:, second], s=1, alpha=0.3)
    ax.scatter(centers[:, first], centers[:, second], s=15)
    ax.set(xlabel='PC %d' % (first + 1), ylabel='PC %d' % (second + 1))
fig.tight_layout()

In [None]:
# TICA at lag=20 restricted to three dimensions; the output used for plotting
# and the k-means run (k=200) both use stride=5.
tica = pyemma.coordinates.tica(data, lag=20, dim=3)
tica_all = np.concatenate(tica.get_output(stride=5))

cluster = pyemma.coordinates.cluster_kmeans(tica, k=200, stride=5)
centers = cluster.clustercenters
ic_labels = ['IC %d' % (idx + 1) for idx in range(tica.dimension())]

# First panel: component histograms. Remaining panels: one per pair of
# components, with the cluster centers drawn over the samples.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
pyemma.plots.plot_feature_histograms(tica_all, ic_labels, ax=axes[0, 0])
component_pairs = ((0, 1), (0, 2), (1, 2))
for ax, (first, second) in zip(axes.flat[1:], component_pairs):
    ax.scatter(tica_all[:, first], tica_all[:, second], s=1, alpha=0.3)
    ax.scatter(centers[:, first], centers[:, second], s=15)
    ax.set(xlabel='IC %d' % (first + 1), ylabel='IC %d' % (second + 1))
fig.tight_layout()