In [1]:
# load example MD data

from msmbuilder.example_datasets import FsPeptide
from msmbuilder.featurizer import DihedralFeaturizer

fs = FsPeptide().get().trajectories
n_atoms = fs[0].n_atoms
fs_dih_feat = DihedralFeaturizer().transform(fs)

loading trajectory_1.xtc...
loading trajectory_10.xtc...
loading trajectory_11.xtc...
loading trajectory_12.xtc...
loading trajectory_13.xtc...
loading trajectory_14.xtc...
loading trajectory_15.xtc...
loading trajectory_16.xtc...
loading trajectory_17.xtc...
loading trajectory_18.xtc...
loading trajectory_19.xtc...
loading trajectory_2.xtc...
loading trajectory_20.xtc...
loading trajectory_21.xtc...
loading trajectory_22.xtc...
loading trajectory_23.xtc...
loading trajectory_24.xtc...
loading trajectory_25.xtc...
loading trajectory_26.xtc...
loading trajectory_27.xtc...
loading trajectory_28.xtc...
loading trajectory_3.xtc...
loading trajectory_4.xtc...
loading trajectory_5.xtc...
loading trajectory_6.xtc...
loading trajectory_7.xtc...
loading trajectory_8.xtc...
loading trajectory_9.xtc...


In [13]:
print(len(fs_dih_feat),fs_dih_feat[0].shape)
ndim = fs_dih_feat[0].shape[1]

(28, (10000, 84))


In [30]:
[len(traj) for traj in fs_dih_feat]

[10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000,
 10000]

In [31]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [114]:
# without time-lag: target = input

zero_mean = lambda X:X - X.mean(0)

X_train = np.vstack([traj[:8000] for traj in fs_dih_feat])
y_train = np.vstack([traj[:8000] for traj in fs_dih_feat])

X_test = np.vstack([traj[8000:] for traj in fs_dih_feat])
y_test = np.vstack([traj[8000:] for traj in fs_dih_feat])

In [115]:
# with time-lag: target = the frame tau steps after the input

tau = 1
X_train_t = np.vstack([traj[:8000-tau] for traj in fs_dih_feat])
y_train_t = np.vstack([traj[tau:8000] for traj in fs_dih_feat])

X_test_t = np.vstack([traj[8000:-tau] for traj in fs_dih_feat])
y_test_t = np.vstack([traj[8000+tau:] for traj in fs_dih_feat])

In [118]:
# build model
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD

def initialize_model(ndim=84,bottleneck=20):
    model = Sequential()
    model.add(Dense(ndim, ndim/2, init='uniform'))
    model.add(Activation('tanh'))
    model.add(Dense(ndim/2, bottleneck, init='uniform'))
    model.add(Activation('tanh'))
    model.add(Dense(bottleneck, ndim/2, init='uniform'))
    model.add(Activation('tanh'))
    model.add(Dense(ndim/2, ndim, init='uniform'))
    model.add(Activation('tanh'))

    sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='mean_squared_error', optimizer=sgd)
    return model

In [119]:
model = initialize_model()

In [106]:
score_before = model.evaluate(X_test, y_test, batch_size=16)
print(score_before)

0.217301841268


In [108]:
model.fit(X_train, y_train, nb_epoch=5, batch_size=16)
score = model.evaluate(X_test, y_test, batch_size=16)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4


In [107]:
print(score)

0.0490611214678


In [109]:
X_reconstructed = model.predict(X_train)

In [110]:
np.mean(((X_reconstructed - X_train)**2).sum(1))

4.7564325394314757

In [111]:
X_reconstructed_test = model.predict(X_test)
np.mean(((X_reconstructed_test - X_test)**2).sum(1))

5.6648203428348829

In [112]:
from sklearn.decomposition import PCA
pca = PCA(20)
pca.fit(X_train)
y = pca.transform(X_train)
X_reconstructed_pca = pca.inverse_transform(y)
np.mean(((X_reconstructed_pca - X_train)**2).sum(1))

5.2748446

In [113]:
pca = PCA(20)
pca.fit(X_train)
y = pca.transform(X_test)
X_reconstructed_pca_test = pca.inverse_transform(y)
np.mean(((X_reconstructed_pca_test - X_test)**2).sum(1))

5.7975378

In [None]:
# so it looks like we're overfitting! let's see if dropout fixes this

In [98]:
def initialize_dropout_model(ndim=84,bottleneck=20):
    model = Sequential()
    model.add(Dense(ndim, ndim/2, init='uniform'))
    model.add(Activation('tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(ndim/2, bottleneck, init='uniform'))
    model.add(Activation('tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(bottleneck, ndim/2, init='uniform'))
    model.add(Activation('tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(ndim/2, ndim, init='uniform'))
    model.add(Activation('tanh'))

    sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='mean_squared_error', optimizer=sgd)
    return model

In [99]:
model_d = initialize_dropout_model()
score_before_d = model_d.evaluate(X_test, y_test, batch_size=16)
print(score_before_d)

0.499997515815


In [100]:
model_d.fit(X_train, y_train, nb_epoch=5, batch_size=16)
score_d = model_d.evaluate(X_test, y_test, batch_size=16)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4


In [101]:
X_reconstructed_test_d = model_d.predict(X_test)
np.mean(((X_reconstructed_test_d - X_test)**2).sum(1))

12.542372326842933

In [87]:
model_t = initialize_model()

In [67]:
score_t_before = model_t.evaluate(X_test_t, y_test_t, batch_size=16)
print(score_t_before)

0.500124928788


In [68]:
model_t.fit(X_train_t, y_train_t, nb_epoch=5, batch_size=16)
score_t = model.evaluate(X_test_t, y_test_t, batch_size=16)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4


In [69]:
print(score_t)

0.130770606093


In [70]:
X_train[0]

array([-0.99183977, -0.96380568,  0.76381123, -0.95633799, -0.99000674,
       -0.98337376, -0.7332539 , -0.65258944, -0.88104635, -0.41120651,
       -0.99899232, -0.57467234, -0.81647223, -0.88175702, -0.64476657,
        0.86235702,  0.74185914, -0.74406517, -0.88981295, -0.99596512,
       -0.96402812,  0.12749065,  0.26660579,  0.64543974,  0.29226288,
        0.14102009,  0.18159303,  0.67995495,  0.75771171,  0.47302994,
       -0.91154224,  0.044881  , -0.81838357,  0.57738471,  0.47170395,
       -0.76437956,  0.50630069,  0.67055577,  0.66810703,  0.45632541,
       -0.08974114,  0.26580018, -0.63624775, -0.6840781 ,  0.45409024,
        0.94701457,  0.56205529, -0.3752068 , -0.5833227 , -0.47958723,
       -0.30344114,  0.17257567,  0.83907872,  0.47501677, -0.52139014,
       -0.14408888, -0.25162369,  0.59073681,  0.7006309 , -0.54538411,
       -0.44660074, -0.26289323,  0.9856469 ,  0.77148479,  0.72940874,
        0.89095569,  0.32119066, -0.82709962,  0.92694116,  0.81

In [72]:
X_train.max(),X_train.min()

(1.0, -1.0)

In [75]:
model_t.predict(X_train[:10])

array([[ -7.96021700e-01,  -7.93986619e-01,  -6.36180162e-01,
         -8.47349226e-01,  -8.58565688e-01,  -8.39037776e-01,
         -9.49761331e-01,  -9.13124204e-01,  -9.69934762e-01,
         -8.31109107e-01,  -8.99948478e-01,  -8.21225882e-01,
         -9.71444607e-01,  -8.60877275e-01,  -7.47213244e-01,
         -1.75116897e-01,   1.05961919e-01,  -3.43644112e-01,
         -8.11412990e-01,  -8.02834451e-01,  -5.76415837e-01,
          3.74361396e-01,   2.26654530e-01,   2.77057290e-03,
         -1.47431195e-02,   3.03767323e-02,   4.25179452e-01,
          3.14984322e-01,   2.18271226e-01,   2.63046801e-01,
          2.04513609e-01,  -4.85163182e-02,   7.60705769e-02,
          2.34572709e-01,   5.80834821e-02,  -1.11691110e-01,
          3.55886459e-01,   4.76908386e-01,   3.10939670e-01,
          3.06345783e-02,  -1.28974110e-01,  -1.55152604e-02,
         -2.50662535e-01,  -1.80942357e-01,  -1.22285411e-01,
         -8.16266239e-02,   3.60424608e-01,  -4.91279155e-01,
        

In [127]:
fs = FsPeptide().get()

loading trajectory_1.xtc...
loading trajectory_10.xtc...
loading trajectory_11.xtc...
loading trajectory_12.xtc...
loading trajectory_13.xtc...
loading trajectory_14.xtc...
loading trajectory_15.xtc...
loading trajectory_16.xtc...
loading trajectory_17.xtc...
loading trajectory_18.xtc...
loading trajectory_19.xtc...
loading trajectory_2.xtc...
loading trajectory_20.xtc...
loading trajectory_21.xtc...
loading trajectory_22.xtc...
loading trajectory_23.xtc...
loading trajectory_24.xtc...
loading trajectory_25.xtc...
loading trajectory_26.xtc...
loading trajectory_27.xtc...
loading trajectory_28.xtc...
loading trajectory_3.xtc...
loading trajectory_4.xtc...
loading trajectory_5.xtc...
loading trajectory_6.xtc...
loading trajectory_7.xtc...
loading trajectory_8.xtc...
loading trajectory_9.xtc...


In [128]:
fs?

In [131]:
print(fs.DESCR)

This dataset consists of 28 molecular dynamics trajectories of Fs peptide
(Ace-A_5(AAARA)_3A-NME), a widely studied model system for protein folding.
Each trajectory is 500 ns in length, and saved at a 50 ps time interval (14
us aggegrate sampling). The simulations were performed using the AMBER99SB-ILDN
force field with GBSA-OBC implicit solvent at 300K, starting from randomly
sampled conformations from an initial 400K unfolding simulation. The
simulations were performed with OpenMM 6.0.1.

The dataset, including the script used to generate the dataset
is available on figshare at

http://dx.doi.org/10.6084/m9.figshare.1030363



In [139]:
fs_t = fs.trajectories
fs_t[0]

<mdtraj.Trajectory with 10000 frames, 264 atoms, 23 residues, without unitcells at 0x168980390>

In [135]:
import mdtraj as md
md.compute_chi1(fs)

AttributeError: 'Bunch' object has no attribute 'top'

In [136]:
fs

{'DESCR': 'This dataset consists of 28 molecular dynamics trajectories of Fs peptide\n(Ace-A_5(AAARA)_3A-NME), a widely studied model system for protein folding.\nEach trajectory is 500 ns in length, and saved at a 50 ps time interval (14\nus aggegrate sampling). The simulations were performed using the AMBER99SB-ILDN\nforce field with GBSA-OBC implicit solvent at 300K, starting from randomly\nsampled conformations from an initial 400K unfolding simulation. The\nsimulations were performed with OpenMM 6.0.1.\n\nThe dataset, including the script used to generate the dataset\nis available on figshare at\n\nhttp://dx.doi.org/10.6084/m9.figshare.1030363\n',
 'trajectories': [<mdtraj.Trajectory with 10000 frames, 264 atoms, 23 residues, without unitcells at 0x168980390>,
  <mdtraj.Trajectory with 10000 frames, 264 atoms, 23 residues, without unitcells at 0x168a66e90>,
  <mdtraj.Trajectory with 10000 frames, 264 atoms, 23 residues, without unitcells at 0x168a66210>,
  <mdtraj.Trajectory with 