# PyEMMA featurization example notebook
## 0. Load MD data from FTP server
Ingredients:
- Topology file: PDB
- Trajectory data: List of .DCD files

In [None]:
from mdshare import load

In [None]:
# Fetch the topology (PDB) and the three trajectory files (DCD) through
# mdshare, using the local 'data' directory as working directory.
# NOTE(review): load presumably returns the local file path - confirm
# against the mdshare documentation.
topfile = load(
    'alanine-dipeptide-nowater.pdb',
    working_directory='data',
)
traj_list = [
    load(
        'alanine-dipeptide-%d-250ns-nowater.dcd' % run_index,
        working_directory='data',
    )
    for run_index in range(3)
]

#### Import PyEMMA

In [None]:
import pyemma
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# IPython magic (not plain Python): render matplotlib figures inline in the
# notebook output.
%matplotlib inline
# Use a default font size of 16 for every figure created below.
matplotlib.rcParams.update({'font.size': 16})

## 1. Several ways of processing the same data
### 1.1 Backbone torsions
- The best possible description
- Two dimensions that describe the full dynamics
- A priori known

In [None]:
# Build a featurizer from the topology file and register the backbone
# torsion angles as its features.
bbtorsion_feat = pyemma.coordinates.featurizer(topfile)
bbtorsion_feat.add_backbone_torsions()

In [None]:
# Read all trajectories through the backbone-torsion featurizer.
# Later cells pass the result to np.concatenate and index it as
# bbtorsions[i][frames, feature], i.e. it is used as a sequence of 2-D arrays.
bbtorsions = pyemma.coordinates.load(traj_list, bbtorsion_feat)

In [None]:
# Free-energy surface over the two backbone torsion features.
# Concatenate the per-trajectory arrays once and reuse the result for both
# axes instead of concatenating twice.
bbtorsions_all = np.concatenate(bbtorsions)
pyemma.plots.plot_free_energy(bbtorsions_all[:, 0], bbtorsions_all[:, 1])
# Raw strings: '\P' is not a valid escape sequence, so the non-raw form
# triggers an invalid-escape warning. The label text itself is unchanged.
plt.xlabel(r'$\Phi$ / rad')
# The trailing semicolon suppresses the notebook's echo of the return value.
plt.ylabel(r'$\Psi$ / rad');

### 1.2 heavy atom distances
- without prior knowledge usually a good choice
- very high dimensional even for this system

In [None]:
# A second, independent featurizer on the same topology, used for the
# heavy-atom distance features below.
heavy_atom_dist_feat = pyemma.coordinates.featurizer(topfile)
# Atom indices returned by the featurizer's heavy-atom selection.
heavy_atom_indices = heavy_atom_dist_feat.select_Heavy()

In [None]:
# Show the selected atom indices for inspection.
print(heavy_atom_indices)

In [None]:
# Register distances between the selected heavy atoms as features.
# NOTE(review): periodic=False presumably disables minimum-image handling of
# periodic boundaries - confirm against the PyEMMA featurizer docs.
heavy_atom_dist_feat.add_distances(heavy_atom_indices, periodic=False)

In [None]:
# Last expression of the cell, so the notebook displays the featurizer's
# dimension (the number of registered features).
heavy_atom_dist_feat.dimension()

In [None]:
# Read all trajectories through the heavy-atom distance featurizer.
# Later cells concatenate the result and iterate over its columns, one per
# feature.
heavy_atom_distances = pyemma.coordinates.load(traj_list, heavy_atom_dist_feat)

In [None]:
# One histogram per distance feature, each scaled to a maximum height of 1
# and stacked vertically: feature number `row` is drawn on the baseline
# row - 0.2.
fig, ax = plt.subplots(figsize=(10, 15))
all_distances = np.concatenate(heavy_atom_distances)
for row, column in enumerate(all_distances.T):
    counts, bin_edges = np.histogram(column, bins=50)
    scaled = counts / np.max(counts)
    baseline = row - .2
    # The left bin edges serve as x positions for the filled curve.
    ax.fill_between(bin_edges[:-1], scaled + row - .2, y2=baseline, color='b', alpha=.25)
    # Thin line marking the baseline of this histogram.
    ax.axhline(y=baseline, xmin=0, xmax=1, color='k', linewidth=.2)

# Ticks sit half-way up each histogram (baseline + 0.5 == row + 0.3) and are
# labelled with the featurizer's description of each feature.
tick_positions = np.arange(heavy_atom_dist_feat.dimension()) + .3
ax.set_yticks(tick_positions)
ax.set_yticklabels(heavy_atom_dist_feat.describe())
ax.set_xlabel('heavy atom distance')
# The trailing semicolon suppresses the notebook's echo of the return value.
ax.set_title('distance histograms per dimension (normalized)');

#### 1.2.1 TICA projection of heavy atom distances

In [None]:
# TICA on the heavy-atom distances. No lag time or output dimension is
# passed, so the library defaults apply.
tica = pyemma.coordinates.tica(heavy_atom_distances)

In [None]:
# Last expression of the cell, so the notebook displays the TICA output
# dimension.
tica.dimension()

In [None]:
# TICA-transformed data. Later cells concatenate it and read columns 0 and 1.
tics = tica.get_output()

In [None]:
# Free-energy surface over the first two TICA coordinates.
tics_all = np.concatenate(tics)
tic_1, tic_2 = tics_all[:, 0], tics_all[:, 1]
pyemma.plots.plot_free_energy(tic_1, tic_2)
plt.xlabel('TIC 1')
# The trailing semicolon suppresses the notebook's echo of the return value.
plt.ylabel('TIC 2');

#### 1.2.2 PCA projection of heavy atom distances

In [None]:
# PCA on the heavy-atom distances.
# NOTE(review): var_cutoff=.9 presumably keeps as many components as needed
# to reach 90% cumulative variance - confirm against the PyEMMA docs.
pca = pyemma.coordinates.pca(heavy_atom_distances, var_cutoff=.9)

In [None]:
# Last expression of the cell, so the notebook displays the PCA output
# dimension.
pca.dimension()

In [None]:
# PCA-transformed data. Later cells concatenate it and read columns 0 and 1.
pcs = pca.get_output()

In [None]:
# Free-energy surface over the first two principal components.
# Concatenate once and reuse the result for both axes.
pcs_all = np.concatenate(pcs)
pyemma.plots.plot_free_energy(pcs_all[:, 0], pcs_all[:, 1])
# These are principal components, so label them 'PC' (the previous 'IC'
# labels did not match the 'PCs' title used for the same data elsewhere).
plt.xlabel('PC 1')
# The trailing semicolon suppresses the notebook's echo of the return value.
plt.ylabel('PC 2');

In [None]:
# Side-by-side free-energy surfaces of the three representations, each drawn
# over its first two coordinates and without a colorbar.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
t = ['backbone torsions', 'TICs', 'PCs']
for panel, title, _y in zip(axes, t, [bbtorsions, tics, pcs]):
    stacked = np.concatenate(_y)
    pyemma.plots.plot_free_energy(stacked[:, 0], stacked[:, 1], ax=panel, cbar=False)
    panel.set_title(title)


## Different ways of discretizing the output
### 3. clustering
#### 3.1 k-means

In [None]:
# `y` is the data discretized in this section. The following cells reuse it
# for plotting and for the regular-space clustering.
y = bbtorsions
# k-means clustering of the backbone torsions with k=75, stride=10 and
# max_iter=30.
clustering_kmeans = pyemma.coordinates.cluster_kmeans(y, k=75, stride=10, max_iter=30)

In [None]:
# k-means cluster centers (black circles) together with the free-energy
# surface of the clustered data. `y` holds the two backbone torsions, so
# star-unpacking the transposed arrays yields exactly (x, y).
plt.plot(*clustering_kmeans.clustercenters.T, 'ko')
pyemma.plots.plot_free_energy(*np.concatenate(y).T)
# Raw strings: '\P' is not a valid escape sequence, so the non-raw form
# triggers an invalid-escape warning. The label text itself is unchanged.
plt.xlabel(r'$\Phi$ / rad')
# The trailing semicolon suppresses the notebook's echo of the return value.
plt.ylabel(r'$\Psi$ / rad');

#### 3.2 regular space clustering

In [None]:
# Regular-space clustering of the same data with dmin=.4.
clustering_regspace = pyemma.coordinates.cluster_regspace(y, dmin=.4)
# Last expression of the cell, so the notebook displays the resulting number
# of clusters.
clustering_regspace.n_clusters

In [None]:
# Regular-space cluster centers (black circles) together with the
# free-energy surface of the clustered data. `y` holds the two backbone
# torsions, so star-unpacking the transposed arrays yields exactly (x, y).
plt.plot(*clustering_regspace.clustercenters.T, 'ko')
pyemma.plots.plot_free_energy(*np.concatenate(y).T)
# Raw strings: '\P' is not a valid escape sequence, so the non-raw form
# triggers an invalid-escape warning. The label text itself is unchanged.
plt.xlabel(r'$\Phi$ / rad')
# The trailing semicolon suppresses the notebook's echo of the return value.
plt.ylabel(r'$\Psi$ / rad');

#### 3.3 discrete trajectories

In [None]:
# Discrete trajectories from the k-means clustering. Later cells index them
# per trajectory (dtrajs[0]).
dtrajs = clustering_kmeans.dtrajs

In [None]:
# Compare a window of the first trajectory in continuous coordinates (top)
# with its discretized counterpart (bottom). Both panels share the x axis.
fig, ax = plt.subplots(2, 1, figsize=(15, 8), sharex=True)
# Frame window [b, e) to display.
b, e = 20400, 21100
# Raw strings: '\P' is not a valid escape sequence, so the non-raw form
# triggers an invalid-escape warning. The legend text itself is unchanged.
ax[0].plot(y[0][b:e, 0], alpha=.75, label=r'$\Phi$')
ax[0].plot(y[0][b:e, 1], alpha=.75, label=r'$\Psi$')
ax[0].set_ylabel('backbone torsion angles')
ax[0].legend()
# x values count frames from the start of the window, matching the implicit
# x axis of the top panel.
ax[1].step(range(dtrajs[0][b:e].shape[0]), dtrajs[0][b:e])
ax[1].set_xlabel('time (steps)')
ax[1].set_ylabel('state')
fig.tight_layout()

In [None]:
# Show the first 25 entries of the first discrete trajectory.
print(dtrajs[0][:25])

## A quick MSM estimate to check our work

In [None]:
# 3x3 overview, one column per representation:
#   row 0 - free-energy surface with both sets of cluster centers
#   row 1 - implied timescales from the k-means discretization
#   row 2 - implied timescales from the regular-space discretization
fig, axes = plt.subplots(3, 3, figsize=(15, 13))
t = ['backbone torsions', 'TICs', 'PCs']
for n, _y in enumerate([bbtorsions, tics, pcs]):
    # Select the first two coordinates explicitly. Star-unpacking the
    # transposed array would pass one positional argument per dimension,
    # which is only correct when the data has exactly two dimensions.
    _y_all = np.concatenate(_y)
    pyemma.plots.plot_free_energy(_y_all[:, 0], _y_all[:, 1], ax=axes[0][n], cbar=False)
    axes[0][n].set_title(t[n], fontweight='bold')

    # Clustering uses the full data of this representation. The regspace
    # dmin is .4 for the first column and .4 / (2.2 * n) for the others.
    cl_objs = [pyemma.coordinates.cluster_kmeans(_y, k=75, max_iter=30, stride=100),
               pyemma.coordinates.cluster_regspace(_y, dmin=.4 if n==0 else .4/(2.2*n), stride=100)]
    for cl_n, cl_obj in enumerate(cl_objs):
        # Centers are drawn in the same two coordinates as the surface:
        # black circles for k-means, red squares for regspace.
        centers = cl_obj.clustercenters
        axes[0][n].plot(centers[:, 0], centers[:, 1], 'ko' if cl_n == 0 else 'rs', alpha=.8)
        its = pyemma.msm.its(cl_obj.dtrajs, lags=[1, 2, 4, 6, 8])
        pyemma.plots.plot_implied_timescales(its, ax=axes[cl_n+1][n])
        # Common y range so the panels are directly comparable.
        axes[cl_n+1][n].set_ylim(1e-1, 3e3)
        axes[cl_n+1][n].set_ylabel('')
# Row labels go on the left-most panels only.
axes[1][0].set_ylabel('k-means clustering', fontweight='bold')
axes[2][0].set_ylabel('regspace clustering', fontweight='bold')
fig.tight_layout()