Make sure you have extracted the following files into the same directory as this notebook:
- training_50_dih.joblib
- validation_dih.joblib
- le.joblib
- ohe.joblib

Then run 'pip install joblib'.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import tensorflow as tf
import time

In [2]:
def dihedral(p):
    """Return the signed torsion (dihedral) angle defined by four 3-D points.

    Praxeolitic formula: 1 sqrt, 1 cross product.

    Parameters
    ----------
    p : array-like
        Sequence whose first four entries are the 3-D points p0..p3.
        Lists and integer/string arrays are accepted; they are promoted to
        float64. Floating-point arrays keep their dtype.

    Returns
    -------
    float
        Angle in radians as returned by ``np.arctan2`` (range [-pi, pi]).
        If p1 and p2 coincide the bond vector has zero length and the
        result is NaN.
    """
    p = np.asarray(p)
    # Promote non-float input so the normalisation below is a true division
    # (an in-place `/=` on an integer array raises a casting error).
    if not np.issubdtype(p.dtype, np.floating):
        p = p.astype(np.float64)

    p0 = p[0]
    p1 = p[1]
    p2 = p[2]
    p3 = p[3]

    b0 = -1.0 * (p1 - p0)
    b1 = p2 - p1
    b2 = p3 - p2

    # normalize b1 so that it does not influence magnitude of vector
    # rejections that come next
    b1 = b1 / np.linalg.norm(b1)

    # vector rejections
    # v = projection of b0 onto plane perpendicular to b1
    #   = b0 minus component that aligns with b1
    # w = projection of b2 onto plane perpendicular to b1
    #   = b2 minus component that aligns with b1
    v = b0 - np.dot(b0, b1) * b1
    w = b2 - np.dot(b2, b1) * b1

    # angle between v and w in a plane is the torsion angle
    # v and w may not be normalized but that's fine since tan is y/x
    x = np.dot(v, w)
    y = np.dot(np.cross(b1, v), w)
    return np.arctan2(y, x)

Modify the path below

In [3]:
# Section markers of a text record, in the order they appear in the file.
# The parsing helpers index into this list to find section boundaries.
data_order = [
    "[ID]",
    "[PRIMARY]",
    "[EVOLUTIONARY]",
    "[TERTIARY]",
    "[MASK]",
]

# Directory that holds the `training_50` and `validation` text files.
# Edit this to point at your local copy; keep the trailing slash because
# file names are appended by plain string concatenation.
txt_data_path = '/home/mikey/Data/ProteinNet/casp7_txt/'

In [14]:
from tqdm import tqdm
from sklearn.externals import joblib

def count_protein(raw_txt_data):
    """Count the records in a list of raw text lines.

    A record is counted once for every line that, after newline removal,
    equals the ID marker (``data_order[0]``).
    """
    id_marker = data_order[0]
    return sum(1 for line in filter_line_end(raw_txt_data) if line == id_marker)

def filter_line_end(data):
    """Return a new list in which every newline character is removed from each string."""
    cleaned = []
    for text in data:
        cleaned.append(text.replace('\n', ''))
    return cleaned

def get_primary_from_all_data(data, lim):
    """Collect the lines of every primary-sequence section.

    Lines are gathered after a "[PRIMARY]" marker and up to (not including)
    the following "[EVOLUTIONARY]" marker.

    Parameters
    ----------
    data : list of str
        File lines with line endings already stripped.
    lim : int or None
        Maximum number of records to read; a falsy value disables the limit.

    Returns
    -------
    list of str
        The collected section lines, in file order.
    """
    id_marker = data_order[0]
    start_marker = data_order[1]   # "[PRIMARY]"
    stop_marker = data_order[2]    # "[EVOLUTIONARY]"

    collected = []
    records_seen = 0
    inside_section = False
    for line in data:
        if line == stop_marker:
            inside_section = False
        if inside_section:
            collected.append(line)
        if line == start_marker:
            inside_section = True
        if line == id_marker:
            records_seen += 1
        # Stop as soon as the header of record number lim + 1 is reached.
        if lim and records_seen > lim:
            break
    return collected

def get_evolutionary_from_all_data(data, lim):
    """Collect the lines of every evolutionary section.

    Lines are gathered after an "[EVOLUTIONARY]" marker and up to (not
    including) the following "[TERTIARY]" marker.

    Parameters
    ----------
    data : list of str
        File lines with line endings already stripped.
    lim : int or None
        Maximum number of records to read; a falsy value disables the limit.

    Returns
    -------
    list of str
        The collected section lines, in file order.
    """
    id_marker = data_order[0]
    start_marker = data_order[-3]  # "[EVOLUTIONARY]"
    stop_marker = data_order[-2]   # "[TERTIARY]"

    collected = []
    records_seen = 0
    inside_section = False
    for line in data:
        if line == stop_marker:
            inside_section = False
        if inside_section:
            collected.append(line)
        if line == start_marker:
            inside_section = True
        if line == id_marker:
            records_seen += 1
        # Stop as soon as the header of record number lim + 1 is reached.
        if lim and records_seen > lim:
            break
    return collected

def get_tertiary_from_all_data(data, lim):
    """Collect the lines of every tertiary (coordinate) section.

    Lines are gathered after a "[TERTIARY]" marker and up to (not including)
    the following "[MASK]" marker.

    Parameters
    ----------
    data : list of str
        File lines with line endings already stripped.
    lim : int or None
        Maximum number of records to read; a falsy value disables the limit.

    Returns
    -------
    list of str
        The collected section lines, in file order.
    """
    id_marker = data_order[0]
    start_marker = data_order[-2]  # "[TERTIARY]"
    stop_marker = data_order[-1]   # "[MASK]"

    collected = []
    records_seen = 0
    inside_section = False
    for line in data:
        if line == stop_marker:
            inside_section = False
        if inside_section:
            collected.append(line)
        if line == start_marker:
            inside_section = True
        if line == id_marker:
            records_seen += 1
        # Stop as soon as the header of record number lim + 1 is reached.
        if lim and records_seen > lim:
            break
    return collected

def group_aminoacids_together(data, every_n):
    """Stack every `every_n` consecutive tab-separated lines into one 2-D array.

    Each line is split on tabs and turned into a column; `every_n` consecutive
    columns are concatenated side by side, so each returned array has
    `every_n` columns (e.g. 3 for tertiary data, 21 for evolutionary data).

    Parameters
    ----------
    data : list of str
        Tab-separated lines; its length is expected to be a multiple of
        `every_n` (an incomplete trailing group raises IndexError).
    every_n : int
        Number of consecutive lines that form one group.

    Returns
    -------
    list of numpy.ndarray
        One array of strings per group.
    """
    columns = [np.expand_dims(row.split('\t'), 1) for row in data]
    grouped = []
    for start in tqdm(range(0, len(columns), every_n)):
        members = [columns[start + offset] for offset in range(every_n)]
        grouped.append(np.concatenate(members, axis=1))

    return grouped

def parse_tertiary_from_file(path, data_lim=None):
    """Read a text data file and return the tertiary coordinates per record.

    Parameters
    ----------
    path : str
        Path of the text file to read.
    data_lim : int or None
        Maximum number of records (proteins) to parse; None reads them all.
        This is a record count, matching the other ``parse_*_from_file``
        helpers. It is deliberately not applied as a raw line slice, because
        cutting the file at an arbitrary line can leave an incomplete
        coordinate triple that breaks the grouping step.

    Returns
    -------
    list of numpy.ndarray
        One string array per record, with the three coordinate lines of the
        record as its three columns.
    """
    with open(path) as f:
        data = f.readlines()

    data_ = filter_line_end(data)
    only_tertiary = get_tertiary_from_all_data(data_, data_lim)
    return group_aminoacids_together(only_tertiary, every_n=3)

def parse_evolutionary_from_file(path, data_lim=None):
    """Read a text data file and return the evolutionary profile per record.

    Parameters
    ----------
    path : str
        Path of the text file to read. The file is opened from this argument
        rather than from the notebook globals ``txt_data_path``/``file_name``,
        so the function no longer depends on hidden kernel state.
    data_lim : int or None
        Maximum number of records (proteins) to parse; None reads them all.

    Returns
    -------
    list of numpy.ndarray
        One string array per record, with the 21 evolutionary lines of the
        record as its 21 columns.
    """
    with open(path) as f:
        data = f.readlines()

    data_ = filter_line_end(data)
    print("Loaded data and filtered line endings")
    only_evo = get_evolutionary_from_all_data(data_, data_lim)
    print("Extracted evolutionary data")
    res = group_aminoacids_together(only_evo, every_n = 21)
    print("Grouped 21's together")
    return res

def parse_primary_from_file(path, data_lim=None):
    """Read a text data file and return the encoded primary sequence per record.

    Parameters
    ----------
    path : str
        Path of the text file to read. The file is opened from this argument
        rather than from the notebook globals ``txt_data_path``/``file_name``,
        so the function no longer depends on hidden kernel state.
    data_lim : int or None
        Maximum number of records (proteins) to parse; None reads them all.

    Returns
    -------
    list of numpy.ndarray
        One dense 2-D array per record, one row per sequence character, as
        produced by the encoders stored in ``le.joblib`` and ``ohe.joblib``
        (presumably a fitted label encoder and one-hot encoder; both files
        are looked up relative to the current working directory).
    """
    with open(path) as f:
        data = f.readlines()

    data_ = filter_line_end(data)
    print("Loaded data and filtered line endings")
    primary = get_primary_from_all_data(data_, data_lim)
    print("Extracted primary data")
    le = load_file('le.joblib')
    ohe = load_file('ohe.joblib')
    # Each sequence string is split into characters and mapped to integer labels.
    primary_in_floats = [le.transform(list(c)) for c in primary]
    # The encoder expects a column vector; its sparse output is densified.
    primary_encoded = [ohe.transform(a.reshape(-1,1)).toarray() for a in primary_in_floats]
    print("Encoded primary sequences")
    return primary_encoded

def get_dih(protein_tertiary):
    """Compute the dihedral angle of every four consecutive atoms of a protein.

    Parameters
    ----------
    protein_tertiary : array-like of shape (n_atoms, 3)
        Atom coordinates, one row per atom; values may be numeric or numeric
        strings (they are converted to float32). `n_atoms` must be a multiple
        of 3 so the result can be reshaped into rows of three.

    Returns
    -------
    numpy.ndarray of shape (n_atoms // 3, 3), dtype float32
        Flat position ``i + 1`` holds the dihedral of atoms ``i .. i+3``.
        The first position and the last two positions have no complete
        four-atom window and are NaN. Using NaN for all placeholders keeps
        the array a plain float array instead of an object array.

    Raises
    ------
    ValueError
        If `n_atoms` is not a multiple of 3 (raised by the final reshape).
    """
    coords = np.asarray(protein_tertiary, dtype=np.float32)
    n_atoms = coords.shape[0]
    n_windows = max(n_atoms - 3, 0)

    # Integer index matrix whose row i is [i, i+1, i+2, i+3]. Building it with
    # arange keeps an integer dtype even when there are no windows at all.
    window_idx = np.arange(n_windows)[:, None] + np.arange(4)
    angles = np.array([dihedral(quad) for quad in coords[window_idx]],
                      dtype=np.float32)

    padded = np.full(n_atoms, np.nan, dtype=np.float32)
    padded[1:1 + n_windows] = angles
    return padded.reshape(-1, 3)

def save_file(data, path):
    """Serialize `data` to the file at `path` using ``joblib.dump``."""
    joblib.dump(data, path) 
    
def load_file(path):
    """Load and return the object stored at `path` using ``joblib.load``.

    NOTE(review): joblib files are pickle-based, so only load files that
    come from a trusted source.
    """
    return joblib.load(path)

This was used to generate the dihedral angles

In [None]:
# file_name = 'training_50'
# # file_name = 'validation'
# tertiary = parse_tertiary_from_file(txt_data_path + file_name)

# dih = []
# for protein in tqdm(tertiary):
#     dih.append(get_dih(protein))
# save_file(dih, file_name + '_dih.joblib')

100%|██████████| 13024/13024 [00:00<00:00, 43820.84it/s]
  from ipykernel import kernelapp as app
 79%|███████▉  | 10303/13024 [04:20<01:15, 36.18it/s]

The `data_lim` setting is important: so far, loading all ~13,000 proteins crashes my system, which has 16 GB of RAM.

Load training data

In [15]:
# Training split: parse at most `data_lim` proteins from the text file and
# load the matching pre-computed dihedral angles.
file_name = 'training_50'
data_lim = 5000

prim_train = parse_primary_from_file(txt_data_path + file_name, data_lim)
evo_train = parse_evolutionary_from_file(txt_data_path + file_name, data_lim)
# The joblib file holds angles for the whole split; keep only the first `data_lim`.
dih_train = load_file('./' + file_name + '_dih.joblib')[:data_lim]

# Sanity check: the three lists should have the same length and per-protein row count.
(len(prim_train), prim_train[0].shape,
 len(evo_train), evo_train[0].shape,
 len(dih_train), dih_train[0].shape)

Loaded data and filtered line endings
Extracted primary data
Encoded primary sequences
Loaded data and filtered line endings
Extracted evolutionary data


100%|██████████| 5000/5000 [00:00<00:00, 13589.33it/s]


Grouped 21's together


(5000, (70, 20), 5000, (70, 21), 5000, (70, 3))

Load validation data

In [16]:
# Validation split: parsed in full (no protein limit) together with its
# pre-computed dihedral angles.
file_name = 'validation'

prim_valid = parse_primary_from_file(txt_data_path + file_name)
evo_valid = parse_evolutionary_from_file(txt_data_path + file_name)
dih_valid = load_file('./' + file_name + '_dih.joblib')

# Sanity check: the three lists should have the same length and per-protein row count.
(len(prim_valid), prim_valid[0].shape,
 len(evo_valid), evo_valid[0].shape,
 len(dih_valid), dih_valid[0].shape)

Loaded data and filtered line endings
Extracted primary data
Encoded primary sequences
Loaded data and filtered line endings
Extracted evolutionary data


100%|██████████| 224/224 [00:00<00:00, 9211.02it/s]

Grouped 21's together





(224, (269, 20), 224, (269, 21), 224, (269, 3))

Pad the data and limit protein length. Prepare the data for training

In [59]:
def fix_nans(a):
    """Replace every NaN in `a` with 0, modifying `a` in place, and return it."""
    nan_positions = np.isnan(a)
    a[nan_positions] = 0
    return a

def pad_array(array, max_len):
    """Left-pad every 2-D array in `array` with zero rows up to `max_len` rows.

    Padding rows are added before the data (at the start of the first axis);
    the second axis is left unchanged. The padded arrays are stacked into a
    single float32 array.
    """
    padded = []
    for seq in array:
        missing_rows = max_len - len(seq)
        padded.append(np.pad(seq, ((missing_rows, 0), (0, 0)), mode='constant'))
    return np.asarray(padded, dtype=np.float32)

def limit_length_and_pad(prim, evo, dih, max_length):
    """Drop proteins longer than `max_length` and zero-pad the rest.

    Parameters
    ----------
    prim, evo, dih : sequences of 2-D arrays
        Per-protein primary, evolutionary and dihedral data. The three
        sequences must be aligned: entry i of each describes the same protein.
    max_length : int
        Maximum number of rows kept per protein; also the padded length.

    Returns
    -------
    tuple of three numpy.ndarray (float32)
        Padded primary, evolutionary and dihedral arrays with NaNs replaced
        by 0.

    Raises
    ------
    ValueError
        If the three input sequences do not have the same length.
    """
    if not (len(prim) == len(evo) == len(dih)):
        raise ValueError(
            "prim, evo and dih must have the same number of proteins, got "
            "%d, %d and %d" % (len(prim), len(evo), len(dih)))

    # Select with a plain boolean list instead of np.array(...)[mask]:
    # building an ndarray from arrays of different lengths raises on recent
    # NumPy versions unless dtype=object is requested explicitly.
    keep = [len(el) <= max_length for el in prim]

    def _select(items):
        # Keep only the entries whose primary sequence is short enough.
        return [item for item, wanted in zip(items, keep) if wanted]

    prim_pad = pad_array(_select(prim), max_length)
    evo_pad = pad_array(_select(evo), max_length)
    dih_pad = pad_array(_select(dih), max_length)
    return fix_nans(prim_pad), fix_nans(evo_pad), fix_nans(dih_pad)

# Proteins longer than this many residues are dropped; the rest are padded to it.
max_len = 200

# Training set: inputs are the primary and evolutionary features joined along
# the feature axis, targets are the dihedral angles.
prim_, evo_, dih_ = limit_length_and_pad(prim_train, evo_train, dih_train, max_len)
x_train = np.concatenate((prim_, evo_), axis=2)
y_train = dih_

# Validation set, prepared the same way.
prim_v, evo_v, dih_v = limit_length_and_pad(prim_valid, evo_valid, dih_valid, max_len)
x_valid = np.concatenate((prim_v, evo_v), axis=2)
y_valid = dih_v

Run a simple regression task to check if it trains on the data

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Multiply, Lambda
# from keras.layers import Embedding
from keras.layers import LSTM

# max_features = 1024

# Two stacked LSTMs that emit one vector per sequence position, followed by a
# per-position dense layer. tanh bounds the output to (-1, 1) and the final
# Lambda scales it by pi, so predictions lie in (-pi, pi) like the angles.
model = Sequential([
    LSTM(32, return_sequences=True),
    LSTM(32, return_sequences=True),
    Dense(3, activation='tanh'),
    Lambda(lambda x: x*np.pi),
])

model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mean_absolute_error'],
)

model.fit(
    x_train,
    y_train,
    validation_data=(x_valid, y_valid),
    batch_size=32,
    epochs=40,
)
# score = model.evaluate(x_valid, y_valid, batch_size=16)

Train on 3053 samples, validate on 127 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
 192/3053 [>.............................] - ETA: 28s - loss: 2.3166 - mean_absolute_error: 0.9431