Make sure you have extracted the following files into the same directory as this notebook:
- training_50_dih.joblib
- validation_dih.joblib
- le.joblib
- ohe.joblib

Then run `pip install joblib`.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import tensorflow as tf
import time

In [2]:
def dihedral(p):
    """Return the signed torsion (dihedral) angle defined by four 3-D points.

    Praxeolitic formula
    1 sqrt, 1 cross product

    Parameters
    ----------
    p : array-like
        Sequence of (at least) four points ``p[0]..p[3]``, each a 3-D
        coordinate. Integer and nested-list inputs are accepted; the input
        is never modified.

    Returns
    -------
    numpy scalar
        The angle in radians as produced by ``np.arctan2`` (range [-pi, pi]).
        The floating dtype of the input is preserved (float32 in, float32 out).
    """
    p = np.asarray(p)
    p0, p1, p2, p3 = p[0], p[1], p[2], p[3]

    b0 = -1.0*(p1 - p0)
    b1 = p2 - p1
    b2 = p3 - p2

    # normalize b1 so that it does not influence magnitude of vector
    # rejections that come next.
    # Out-of-place division on purpose: an in-place `/=` cannot store the
    # float result back into an integer array and raises for int coordinates.
    b1 = b1 / np.linalg.norm(b1)

    # vector rejections
    # v = projection of b0 onto plane perpendicular to b1
    #   = b0 minus component that aligns with b1
    # w = projection of b2 onto plane perpendicular to b1
    #   = b2 minus component that aligns with b1
    v = b0 - np.dot(b0, b1)*b1
    w = b2 - np.dot(b2, b1)*b1

    # angle between v and w in a plane is the torsion angle
    # v and w may not be normalized but that's fine since tan is y/x
    x = np.dot(v, w)
    y = np.dot(np.cross(b1, v), w)
    return np.arctan2(y, x)

Modify the path below

In [3]:
# Section header lines of a ProteinNet text record. The parsing helpers index
# into this list (positively and negatively), so the order must not change.
data_order = [
    "[ID]",
    "[PRIMARY]",
    "[EVOLUTIONARY]",
    "[TERTIARY]",
    "[MASK]",
]

# Change this path to match the directory that holds the training_50 and
# validation text files. Keep the trailing slash: file names are appended to
# it by plain string concatenation.
txt_data_path = '/home/mikey/Data/ProteinNet/casp7_txt/'

In [4]:
from tqdm import tqdm
from sklearn.externals import joblib

def count_protein(raw_txt_data):
    """Count the protein records in raw text lines.

    Newlines are removed from every line first; each line that then equals
    the ID marker (``data_order[0]``) counts as one protein.
    """
    return sum(1 for entry in filter_line_end(raw_txt_data) if entry == data_order[0])

def filter_line_end(data):
    """Return a new list with every newline character removed from each string.

    Note that ``str.replace`` drops *all* ``'\\n'`` characters in a string,
    not only a trailing one.
    """
    cleaned = []
    for text in data:
        cleaned.append(text.replace('\n', ''))
    return cleaned

def get_mask_from_all_data(data, lim):
    """Collect the lines that follow each "[MASK]" header.

    Collection starts on the line after a "[MASK]" header and stops at the
    next "[ID]" header. Every line in between is kept, including empty ones.

    Parameters
    ----------
    data : list of str
        Lines of the text file with line endings already removed.
    lim : int or None
        Maximum number of proteins to read; a falsy value means no limit.

    Returns
    -------
    list of str
        The collected lines, in file order.
    """
    id_marker = data_order[0]
    mask_marker = data_order[4]

    collected = []
    proteins_seen = 0
    collecting = False
    for entry in data:
        if entry == id_marker:
            # a new record begins: the previous mask section is over
            collecting = False
            proteins_seen += 1
        elif collecting:
            collected.append(entry)
        if entry == mask_marker:
            # start collecting from the next line on
            collecting = True
        if lim and proteins_seen > lim:
            break
    return collected

def get_primary_from_all_data(data, lim):
    """Collect the lines between each "[PRIMARY]" and "[EVOLUTIONARY]" header.

    Parameters
    ----------
    data : list of str
        Lines of the text file with line endings already removed.
    lim : int or None
        Maximum number of proteins to read; a falsy value means no limit.

    Returns
    -------
    list of str
        The collected primary-section lines, in file order.
    """
    id_marker = data_order[0]
    start_marker = data_order[1]   # "[PRIMARY]"
    stop_marker = data_order[2]    # "[EVOLUTIONARY]"

    collected = []
    proteins_seen = 0
    collecting = False
    for entry in data:
        if entry == stop_marker:
            collecting = False
        elif collecting:
            collected.append(entry)
        if entry == start_marker:
            # start collecting from the next line on
            collecting = True
        if entry == id_marker:
            proteins_seen += 1
        if lim and proteins_seen > lim:
            break
    return collected

def get_evolutionary_from_all_data(data, lim):
    """Collect the lines between each "[EVOLUTIONARY]" and "[TERTIARY]" header.

    Parameters
    ----------
    data : list of str
        Lines of the text file with line endings already removed.
    lim : int or None
        Maximum number of proteins to read; a falsy value means no limit.

    Returns
    -------
    list of str
        The collected evolutionary-section lines, in file order.
    """
    id_marker = data_order[0]
    start_marker = data_order[-3]  # "[EVOLUTIONARY]"
    stop_marker = data_order[-2]   # "[TERTIARY]"

    collected = []
    proteins_seen = 0
    collecting = False
    for entry in data:
        if entry == stop_marker:
            collecting = False
        elif collecting:
            collected.append(entry)
        if entry == start_marker:
            # start collecting from the next line on
            collecting = True
        if entry == id_marker:
            proteins_seen += 1
        if lim and proteins_seen > lim:
            break
    return collected

def get_tertiary_from_all_data(data, lim):
    """Collect the lines between each "[TERTIARY]" and "[MASK]" header.

    Parameters
    ----------
    data : list of str
        Lines of the text file with line endings already removed.
    lim : int or None
        Maximum number of proteins to read; a falsy value means no limit.

    Returns
    -------
    list of str
        The collected tertiary-section lines, in file order.
    """
    id_marker = data_order[0]
    start_marker = data_order[-2]  # "[TERTIARY]"
    stop_marker = data_order[-1]   # "[MASK]"

    collected = []
    proteins_seen = 0
    collecting = False
    for entry in data:
        if entry == stop_marker:
            collecting = False
        elif collecting:
            collected.append(entry)
        if entry == start_marker:
            # start collecting from the next line on
            collecting = True
        if entry == id_marker:
            proteins_seen += 1
        if lim and proteins_seen > lim:
            break
    return collected

def group_aminoacids_together(data, every_n):
    """Merge every `every_n` consecutive text lines into one 2-D float array.

    Each line is split on tabs and turned into a float32 column of shape
    (n_values, 1); `every_n` consecutive columns are then joined side by side
    into an array of shape (n_values, every_n)
    (e.g. every_n=3 for tertiary and 21 for evolutionary data).

    Returns a list with one array per group. ``len(data)`` is expected to be
    a multiple of `every_n`; otherwise indexing the last group raises.
    """
    columns = [
        np.asarray(np.expand_dims(row.split('\t'), 1), dtype=np.float32)
        for row in data
    ]

    grouped = []
    for start in tqdm(range(0, len(columns), every_n)):
        members = [columns[start + offset] for offset in range(every_n)]
        grouped.append(np.concatenate(members, axis=1))
    return grouped

def parse_tertiary_from_file(path, data_lim=None):
    """Read the tertiary (coordinate) sections of a ProteinNet text file.

    Parameters
    ----------
    path : str
        Path of the text file to read.
    data_lim : int or None
        Maximum number of proteins to parse; None reads every protein.

    Returns
    -------
    list of np.ndarray
        One float32 array per protein, built by grouping every 3 tertiary
        lines into columns (see `group_aminoacids_together`).
    """
    with open(path) as f:
        data = f.readlines()

    # `data_lim` counts proteins, not lines, so the line list must not be
    # sliced with it (doing so truncated the file after `data_lim` lines).
    # The protein limit is applied by get_tertiary_from_all_data instead.
    data_ = filter_line_end(data)
    only_tertiary = get_tertiary_from_all_data(data_, data_lim)
    return group_aminoacids_together(only_tertiary, every_n=3)

def parse_evolutionary_from_file(path, data_lim=None):
    """Read the evolutionary sections of a ProteinNet text file.

    Parameters
    ----------
    path : str
        Path of the text file to read. The file is opened from this argument
        rather than from the notebook globals `txt_data_path`/`file_name`.
    data_lim : int or None
        Maximum number of proteins to parse; None reads every protein.

    Returns
    -------
    list of np.ndarray
        One float32 array per protein, built by grouping every 21
        evolutionary lines into columns.
    """
    with open(path) as f:
        data = f.readlines()

    data_ = filter_line_end(data)
    print("Loaded data and filtered line endings")
    only_evo = get_evolutionary_from_all_data(data_, data_lim)
    print("Extracted evolutionary data")
    res = group_aminoacids_together(only_evo, every_n = 21)
    print("Grouped 21's together")
    return res

def parse_primary_from_file(path, data_lim=None):
    """Read and encode the primary sequences of a ProteinNet text file.

    Parameters
    ----------
    path : str
        Path of the text file to read. The file is opened from this argument
        rather than from the notebook globals `txt_data_path`/`file_name`.
    data_lim : int or None
        Maximum number of proteins to parse; None reads every protein.

    Returns
    -------
    list of np.ndarray
        One dense array per protein: each character of the sequence is mapped
        through the encoder stored in 'le.joblib' and then through the one in
        'ohe.joblib' (both loaded from the current working directory;
        presumably a fitted label encoder and one-hot encoder).
    """
    with open(path) as f:
        data = f.readlines()

    data_ = filter_line_end(data)
    print("Loaded data and filtered line endings")
    primary = get_primary_from_all_data(data_, data_lim)
    print("Extracted primary data")
    le = load_file('le.joblib')
    ohe = load_file('ohe.joblib')
    # sequence string -> list of characters -> integer labels
    primary_in_floats = [le.transform(list(c)) for c in primary]
    # integer labels (as a column) -> dense one-hot rows
    primary_encoded = [ohe.transform(a.reshape(-1,1)).toarray() for a in primary_in_floats]
    print("Encoded primary sequences")
    return primary_encoded

def parse_mask_from_file(path, data_lim=None):
    """Read the mask sections of a ProteinNet text file as boolean arrays.

    Parameters
    ----------
    path : str
        Path of the text file to read. The file is opened from this argument
        rather than from the notebook globals `txt_data_path`/`file_name`.
    data_lim : int or None
        Maximum number of proteins to parse; None reads every protein.

    Returns
    -------
    list of np.ndarray
        One boolean array per non-empty mask line; an entry is True where
        the mask character is '+'.
    """
    with open(path) as f:
        data = f.readlines()

    data_ = filter_line_end(data)
    print("Loaded data and filtered line endings")
    only_mask = get_mask_from_all_data(data_, data_lim)
    # the mask section also yields the blank separator lines; drop them
    only_mask_ = [np.array(list(mask)) == '+' for mask in only_mask if mask != '']

    print("Extracted mask data")
    return only_mask_

def get_dih(protein_tertiary):
    """Compute the dihedral angle of every 4 consecutive points of a protein.

    Parameters
    ----------
    protein_tertiary : array-like of shape (n_points, 3)
        Coordinates, one point per row. `n_points` must be a multiple of 3
        (presumably the three backbone atoms of each residue — confirm
        against the data format).

    Returns
    -------
    np.ndarray of shape (n_points // 3, 3), dtype float32
        Flat position ``i + 1`` holds the dihedral of points ``i..i+3``.
        The first position and the last two have no complete window and are
        NaN. A real float array is returned; the previous
        ``np.append(..., [None, None])`` silently produced an object array
        containing ``None``.
    """
    coords = np.asarray(protein_tertiary, dtype=np.float32)
    n_points = coords.shape[0]

    # NaN-filled buffer: slots without a 4-point window simply stay NaN, which
    # also covers inputs too short to form any window.
    angles = np.full(n_points, np.nan, dtype=np.float32)
    for start in range(n_points - 3):
        angles[start + 1] = dihedral(coords[start:start + 4])
    return angles.reshape(-1, 3)

def save_file(data, path):
    """Serialize `data` to `path` with joblib; nothing is returned."""
    joblib.dump(value=data, filename=path)
    
def load_file(path):
    """Load and return the object stored at `path` by joblib.

    NOTE(review): joblib files are pickle-based, so only load files from a
    trusted source.
    """
    loaded = joblib.load(filename=path)
    return loaded

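The commented-out cell below was used to generate the dihedral angles stored in the `*_dih.joblib` files; uncomment it only if you need to regenerate them.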

In [5]:
# file_name = 'training_50'
# # file_name = 'validation'
# tertiary = parse_tertiary_from_file(txt_data_path + file_name)

# dih = []
# for protein in tqdm(tertiary):
#     dih.append(get_dih(protein))
# save_file(dih, file_name + '_dih.joblib')

`data_lim` limits how many proteins are loaded. It is important because, so far, loading the full set of about 13,000 proteins crashes my system, which has 16 GB of RAM.

Load training data

In [78]:
file_name = 'training_50'
data_lim = 8000

# Same call order as before: primary, evolutionary, cached dihedrals, mask.
prim_train = parse_primary_from_file(txt_data_path + file_name, data_lim)
evo_train = parse_evolutionary_from_file(txt_data_path + file_name, data_lim)
# The dihedral file holds every protein, so cut it to the same limit.
dih_train = load_file('./' + file_name + '_dih.joblib')[:data_lim]
mask_train = parse_mask_from_file(txt_data_path + file_name, data_lim)

# Sanity check: all four lists should have the same length and the first
# protein should have the same number of rows in each of them.
(len(prim_train), prim_train[0].shape,
 len(evo_train), evo_train[0].shape,
 len(dih_train), dih_train[0].shape,
 len(mask_train), mask_train[0].shape)

Loaded data and filtered line endings
Extracted primary data
Encoded primary sequences
Loaded data and filtered line endings
Extracted evolutionary data


100%|██████████| 8000/8000 [00:00<00:00, 75934.07it/s]


Grouped 21's together
Loaded data and filtered line endings
Extracted mask data


(8000, (70, 20), 8000, (70, 21), 8000, (70, 3), 8000, (70,))

Load validation data

In [79]:
# Display the NumPy version this notebook was run with.
np.__version__

'1.15.4'

In [80]:
file_name = 'validation'

# Same call order as before: primary, evolutionary, cached dihedrals, mask.
# No protein limit is applied to the validation set.
prim_valid = parse_primary_from_file(txt_data_path + file_name)
evo_valid = parse_evolutionary_from_file(txt_data_path + file_name)
dih_valid = load_file('./' + file_name + '_dih.joblib')
mask_valid = parse_mask_from_file(txt_data_path + file_name)

# Sanity check: all four lists should have the same length and the first
# protein should have the same number of rows in each of them.
(len(prim_valid), prim_valid[0].shape,
 len(evo_valid), evo_valid[0].shape,
 len(dih_valid), dih_valid[0].shape,
 len(mask_valid), mask_valid[0].shape)

Loaded data and filtered line endings
Extracted primary data
Encoded primary sequences
Loaded data and filtered line endings
Extracted evolutionary data


100%|██████████| 224/224 [00:00<00:00, 57568.88it/s]

Grouped 21's together
Loaded data and filtered line endings
Extracted mask data





(224, (269, 20), 224, (269, 21), 224, (269, 3), 224, (269,))

Pad the data and limit protein length. Prepare the data for training

In [92]:
def fix_nans(a):
    """Overwrite every NaN in `a` with 0, in place, and return the same array."""
    a[np.isnan(a)] = 0
    return a

def pad_array(array, max_len, value=0., dtype=np.float32):
    """Front-pad every sequence to `max_len` rows and stack them in one array.

    Parameters
    ----------
    array : sequence of np.ndarray
        Sequences of shape (length,) or (length, n_features). Lengths may
        differ; a plain list and a NumPy (object) array are both accepted.
        Whether a reshape to (length, 1) is needed is decided from the first
        element only.
    max_len : int
        Number of rows of every padded sequence. A sequence longer than
        `max_len` makes ``np.pad`` raise.
    value : scalar
        Fill value for the padding rows, which are inserted *before* the data.
    dtype : data-type
        dtype of the returned array.

    Returns
    -------
    np.ndarray of shape (n_sequences, max_len, n_features)
    """
    print("padded")
    # Work on a plain list: building an ndarray from sequences of different
    # lengths is an error on recent NumPy versions, and a list input has no
    # `.shape` attribute.
    items = list(array)
    if len(items[0].shape) < 2:
        items = [a.reshape(-1, 1) for a in items]
    print((len(items),), items[0].shape)
    padded = [
        np.pad(a, [(max_len - len(a), 0), (0, 0)], mode='constant', constant_values=value)
        for a in items
    ]
    return np.asarray(padded, dtype=dtype)

def limit_length_and_pad(prim, evo, dih, mask, max_length):
    """Drop proteins longer than `max_length` and pad the rest to that length.

    Parameters
    ----------
    prim, evo, dih, mask : sequences of np.ndarray
        Per-protein arrays in the same order; the length of each `prim`
        entry decides whether the protein at that position is kept.
    max_length : int
        Maximum number of rows per protein, and the padded length.

    Returns
    -------
    tuple of np.ndarray
        (prim_pad, evo_pad, dih_pad, mask_pad). The first three are float32
        arrays of shape (n_kept, max_length, n_features) with NaNs replaced
        by 0; `mask_pad` is boolean with shape (n_kept, max_length) and is
        False on the padding rows.

    Raises
    ------
    IndexError
        If the four inputs do not all contain the same number of proteins.
    """
    len_mask = np.array([len(el) for el in prim]) <= max_length

    def _select(sequences):
        # Keep the selected proteins in a 1-D object array: sequences have
        # different lengths, and implicit ragged array creation is an error
        # on recent NumPy versions.
        if len(sequences) != len(len_mask):
            raise IndexError(
                "expected %d sequences to match `prim`, got %d"
                % (len(len_mask), len(sequences))
            )
        kept = [seq for seq, keep in zip(sequences, len_mask) if keep]
        selected = np.empty(len(kept), dtype=object)
        for i, seq in enumerate(kept):
            selected[i] = seq
        return selected

    prim_lim, evo_lim, dih_lim, mask_lim = _select(prim), _select(evo), _select(dih), _select(mask)

    prim_pad = pad_array(prim_lim, max_length)
    evo_pad = pad_array(evo_lim, max_length)
    dih_pad = pad_array(dih_lim, max_length)
    # builtin `bool` instead of the `np.bool` alias, which newer NumPy removed
    mask_pad = pad_array(mask_lim, max_length, value=False, dtype=bool)
    mask_pad = mask_pad.reshape(mask_pad.shape[0], -1) # this is necessary because numpy expects this shape to use mask as an index
    return fix_nans(prim_pad), fix_nans(evo_pad), fix_nans(dih_pad), fix_nans(mask_pad)

max_len = 500

# Training set: inputs are the one-hot primary and evolutionary features joined
# along the feature axis; targets are the dihedral angles.
prim_, evo_, dih_, mask_ = limit_length_and_pad(prim_train, evo_train, dih_train, mask_train, max_len)
x_train = np.concatenate([prim_, evo_], axis=2)
y_train = dih_

# Validation set, prepared the same way.
prim_v, evo_v, dih_v, mask_v = limit_length_and_pad(prim_valid, evo_valid, dih_valid, mask_valid, max_len)
x_valid = np.concatenate([prim_v, evo_v], axis=2)
y_valid = dih_v

padded
(7623,) (70, 20)
padded
(7623,) (70, 21)
padded
(7623,) (70, 3)
padded
(7623,) (70, 1)
padded
(213,) (269, 20)
padded
(213,) (269, 21)
padded
(213,) (269, 3)
padded
(213,) (269, 1)


In [93]:
# Sanity check: (n_proteins, max_len, n_features), where the features are the
# primary and evolutionary columns concatenated along the last axis.
x_train.shape

(7623, 500, 41)

In [94]:
# Overwrite all features of every position whose mask is False (padding rows
# included) with -1, in place. Re-running this cell gives the same result.
x_train[~mask_] = -1.
x_valid[~mask_v] = -1.

In [95]:
# Collects the History object returned by each model.fit call made below.
# Re-running this cell discards the histories gathered so far.
histories = []

Run a simple regression task to check if it trains on the data

In [96]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Lambda, Masking, concatenate
from keras.layers import LSTM, Conv1D, Input

# Functional-API model: one 3-value output per sequence position.
inputs = Input(shape=(max_len,41))
# Positions were filled with -1 where the mask is False; per the Keras Masking
# layer, timesteps whose features all equal mask_value are skipped downstream.
x1 = Masking(mask_value=-1.)(inputs)
x2 = LSTM(12, return_sequences=True)(x1)
# x12 = concatenate([x1,x2])
# x3 = LSTM(64, return_sequences=True)(x2)
# x23 = concatenate([x2,x3])
# tanh output in (-1, 1) is scaled by pi below, giving values in (-pi, pi).
x4 = Dense(3, activation='tanh')(x2)
y = Lambda(lambda x: x*np.pi)(x4)
model = Model(inputs=inputs, outputs=y)

# Earlier Sequential experiments, kept for reference:
# model = Sequential()
# model.add(Masking(mask_value=-1.))
# model.add(LSTM(128, return_sequences=True))
# # model.add(LSTM(128, return_sequences=True))
# # model.add(Conv1D(32, 15, padding='same'))
# # model.add(Conv1D(64, 15, padding='same'))
# # model.add(Conv1D(128, 15, padding='same'))
# # model.add(Dropout(0.5))
# model.add(Dense(3, activation='tanh'))
# model.add(Lambda(lambda x: x*np.pi))

# NOTE(review): mean absolute error treats the targets as plain numbers, so
# angles close to +pi and -pi count as far apart - confirm this is intended.
model.compile(loss='mean_absolute_error',
              optimizer='adam',
              metrics=['mean_absolute_error'])

history = model.fit(x_train, y_train, validation_data=(x_valid, y_valid), batch_size=100, epochs=10)
# Keep the training curves of this run alongside those of earlier runs.
histories.append(history)
# score = model.evaluate(x_valid, y_valid, batch_size=16)

Train on 7623 samples, validate on 213 samples
Epoch 1/10
 900/7623 [==>...........................] - ETA: 40s - loss: 0.6327 - mean_absolute_error: 0.6327

KeyboardInterrupt: 

In [12]:
# Print name, input shape and output shape of every layer of the current
# `model` (whichever model was built last in this kernel session).
for layer in model.layers:
    print(layer.name, layer.input_shape, layer.output_shape)

conv1d_1 (None, 200, 41) (None, 200, 32)
conv1d_2 (None, 200, 32) (None, 200, 64)
conv1d_3 (None, 200, 64) (None, 200, 128)
dense_1 (None, 200, 128) (None, 200, 3)
lambda_1 (None, 200, 3) (None, 200, 3)


In [134]:
def atan2(x, y, epsilon=1.0e-12):
    """Element-wise, quadrant-aware arctangent built from TensorFlow ops.

    Note the argument order: `x` comes first and the angle is derived from
    ``tf.atan(y/x)``, so ``atan2(x, y)`` here corresponds to the conventional
    ``atan2(y, x)``.

    Args:
        x: tensor used as the denominator of ``y/x``.
        y: tensor used as the numerator of ``y/x``; same shape as `x`.
        epsilon: amount added to entries that are exactly 0.0 before dividing.

    Returns:
        Tensor of angles in radians, with the shape of `x`.
    """
    # Move exact zeros away from 0 so that y/x never divides by zero.
    x = tf.where(tf.equal(x, 0.0), x+epsilon, x)
    y = tf.where(tf.equal(y, 0.0), y+epsilon, y)    
    # Right half-plane (x > 0): atan(y/x) is already the answer.
    angle = tf.where(tf.greater(x,0.0), tf.atan(y/x), tf.zeros_like(x))
    # Left half-plane (x < 0): shift by +pi (y >= 0) or -pi (y < 0).
    angle = tf.where(tf.logical_and(tf.less(x,0.0),  tf.greater_equal(y,0.0)), tf.atan(y/x) + np.pi, angle)
    angle = tf.where(tf.logical_and(tf.less(x,0.0),  tf.less(y,0.0)), tf.atan(y/x) - np.pi, angle)
    # NOTE(review): x was nudged above, so with a non-zero epsilon it is never
    # exactly 0 here and the three x == 0 cases below look reachable only when
    # epsilon is 0 (or too small for x's dtype). A side effect is that the
    # input (0, 0) gives atan(epsilon/epsilon) = pi/4 rather than 0 - confirm
    # that this is intended.
    angle = tf.where(tf.logical_and(tf.equal(x,0.0), tf.greater(y,0.0)), 0.5*np.pi * tf.ones_like(x), angle)
    angle = tf.where(tf.logical_and(tf.equal(x,0.0), tf.less(y,0.0)), -0.5*np.pi * tf.ones_like(x), angle)
    angle = tf.where(tf.logical_and(tf.equal(x,0.0), tf.equal(y,0.0)), tf.zeros_like(x), angle)
    return angle

# y in radians
def rmse_360_2(y_true, y_pred):
    """Loss comparing angles decoded from value pairs against target angles.

    `y_pred` is reshaped to (batch, steps, 3, 2); each of the 3 pairs is
    turned into one angle with `atan2(pair[0], pair[1])`. The result is the
    mean absolute difference between these angles and `y_true`.

    NOTE(review): despite the name, no square or square root is taken, and
    the difference is not wrapped around 2*pi - confirm this is intended.
    """
    batch_size = tf.shape(y_pred)[0]
    n_steps = tf.shape(y_pred)[1]
    pairs = tf.reshape(y_pred, shape=(batch_size, n_steps, 3, 2))

    # one (batch, steps) angle tensor per pair, stacked on a new last axis
    angles = tf.stack(
        [atan2(pairs[:, :, k, 0], pairs[:, :, k, 1]) for k in range(3)],
        axis=-1,
    )
    return K.mean(K.abs(angles - y_true))

In [118]:
# Quick check of atan2 on two points: (x=1, y=1) and (x=1, y=-1) should give
# +pi/4 and -pi/4. A default session is required for .eval().
with tf.Session().as_default():
    print(atan2(np.array([1.,1.]),np.array([1., -1])).eval())

[ 0.78539816 -0.78539816]


In [148]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Lambda
from keras.layers import LSTM, Conv1D
import keras.backend as K
from keras.optimizers import Adam


# Second experiment: Conv1D -> Conv1D -> LSTM -> Dense(6), trained with the
# custom rmse_360_2 loss. This rebinds the global `model`.
model = Sequential()
# model.add(LSTM(128, return_sequences=True))
model.add(Conv1D(32, 5, padding='same'))
model.add(Conv1D(64, 10, padding='same'))
model.add(LSTM(16, return_sequences=True))
# model.add(Conv1D(128, 15, padding='same'))
# model.add(Dropout(0.5))
# 6 outputs per position: rmse_360_2 reshapes them into 3 pairs and decodes
# one angle from each pair.
model.add(Dense(6))
# model.add(Lambda(lambda x: x*np.pi))

# NOTE(review): this model has no Masking layer, so the positions filled with
# -1 are fed to the convolutions like ordinary data - confirm this is intended.
optm = Adam(lr=0.00001)
model.compile(loss=rmse_360_2,
              optimizer=optm)

model.fit(x_train, y_train, validation_data=(x_valid, y_valid), batch_size=32, epochs=50)
# score = model.evaluate(x_valid, y_valid, batch_size=16)
# score = model.evaluate(x_valid, y_valid, batch_size=16)

Train on 3053 samples, validate on 127 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fa082dee908>