In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from statsmodels.tsa.stattools import acovf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D

In [45]:
# Load the cleaned training pairs; the preview below shows an `interacts`
# label plus the two amino-acid sequences (`seq1`, `seq2`) for each pair.
df = pd.read_csv("../data/train_cleaned.csv")
df.head()

Unnamed: 0,interacts,seq1,seq2
0,1,MHKTASQRLFPGPSYQNIKSIMEDSTILSDWTNSNKQKMKYDFSCE...,MERRRITSAARRSYVSSGEMMVGGLAPGRRLGPGTRLSLARMPPPL...
1,1,MPYNFCLPSLSCRTSCSSRPCVPPSCHGYTLPGACNIPANVSNCNW...,MSQAYSSSQRVSSYRRTFGGAPGFPLGSPLSSPVFPRAGFGSKGSS...
2,1,MSFSEMNRRTLAFRGGGLVTASGGGSTNNNAGGEASAWPPQPQPRQ...,MALCLKQVFAKDKTFRPRKRFEPGTQRFELYKKAQASLKSGLDLRS...
3,1,MKFQYKEDHPFEYRKKEGEKIRKKYPDRVPVIVEKAPKARVPDLDK...,MEPQVTLNVTFKNEIQSFLVSDPENTTWADIEAMVKVSFDLNTIQI...
4,1,MTILGTTFGMVFSLLQVVSGESGYAQNGDLEDAELDDYSFSCYSQL...,MRVAGAAKLVVAVAVFLLTFYVISQVFEIKMDASLGNLFARSALDT...


In [46]:
# Physicochemical descriptors for the 20 standard amino acids, keyed by
# one-letter code. Every list uses the same order as the `columns` names
# assigned later in the notebook:
#   [hydrophobicity, hydrophilicity, net_charge_index, polarity,
#    polarizability, solvent_accessible_surface_area, volume]
# The "Y" entry previously had net_charge_index and volume swapped
# (117.3 in slot 2, 0.023599 in slot 6); it now follows the shared order.
aa_prop_dict = {
    "A": [0.62, -0.5, 0.007187, 8.1, 0.046, 1.181, 27.5],
    "C": [0.29, -1, -0.03661, 5.5, 0.128, 1.461, 44.6],
    "D": [-0.9, 3, -0.02382, 13, 0.105, 1.587, 40],
    "E": [-0.74, 3, 0.006802, 12.3, 0.151, 1.862, 62],
    "F": [1.19, -2.5, 0.037552, 5.2, 0.29, 2.228, 115.5],
    "G": [0.48, 0, 0.179052, 9, 0, 0.881, 0],
    "H": [-0.4, -0.5, -0.01069, 10.4, 0.23, 2.025, 79],
    "I": [1.38, -1.8, 0.021631, 5.2, 0.186, 1.81, 93.5],
    "K": [-1.5, 3, 0.017708, 11.3, 0.219, 2.258, 100],
    "L": [1.06, -1.8, 0.051672, 4.9, 0.186, 1.931, 93.5],
    "M": [0.64, -1.3, 0.002683, 5.7, 0.221, 2.034, 94.1],
    "N": [-0.78, 2, 0.005392, 11.6, 0.134, 1.655, 58.7],
    "P": [0.12, 0, 0.239531, 8, 0.131, 1.468, 41.9],
    "Q": [-0.85, 0.2, 0.049211, 10.5, 0.18, 1.932, 80.7],
    "R": [-2.53, 3, 0.043587, 10.5, 0.291, 2.56, 105],
    "S": [-0.18, 0.3, 0.004627, 9.2, 0.062, 1.298, 29.3],
    "T": [-0.05, -0.4, 0.003352, 8.6, 0.108, 1.525, 51.3],
    "V": [1.08, -1.5, 0.057004, 5.9, 0.14, 1.645, 71.5],
    "W": [0.81, -3.4, 0.037977, 5.4, 0.409, 2.663, 145.5],
    "Y": [0.26, -2.3, 0.023599, 6.2, 0.298, 2.368, 117.3],
}

In [47]:
# Build a frame from the property dict: each amino-acid code becomes a
# column and each of the seven list positions becomes a row.
aa_props = pd.DataFrame(aa_prop_dict)

In [48]:
# One row per amino acid, one column per property. Rebuilt from the dict
# (instead of transposing `aa_props` in place) so that re-running this cell
# cannot flip the orientation back.
aa_props = pd.DataFrame(aa_prop_dict).T

In [49]:
# Preview: rows are amino-acid codes, columns are still positional (0-6).
aa_props.head()

Unnamed: 0,0,1,2,3,4,5,6
A,0.62,-0.5,0.007187,8.1,0.046,1.181,27.5
C,0.29,-1.0,-0.03661,5.5,0.128,1.461,44.6
D,-0.9,3.0,-0.02382,13.0,0.105,1.587,40.0
E,-0.74,3.0,0.006802,12.3,0.151,1.862,62.0
F,1.19,-2.5,0.037552,5.2,0.29,2.228,115.5


In [50]:
# Maximum lag handed to `acovf` as `nlag` in generate_timeseries; the stacked
# feature array shown later has lag + 1 = 31 values along its last axis.
lag = 30

In [51]:
# Human-readable names for the seven property columns, in the same order as
# the value lists in `aa_prop_dict`. `columns` is also iterated by
# generate_timeseries, so the order here fixes the feature order.
columns = [
    "hydrophobicity",
    "hydrophilicity",
    "net_charge_index",
    "polarity",
    "polarizability",
    "solvent_accessible_surface_area",
    "volume",
]
aa_props.columns = columns
aa_props.head()

Unnamed: 0,hydrophobicity,hydrophilicity,net_charge_index,polarity,polarizability,solvent_accessible_surface_area,volume
A,0.62,-0.5,0.007187,8.1,0.046,1.181,27.5
C,0.29,-1.0,-0.03661,5.5,0.128,1.461,44.6
D,-0.9,3.0,-0.02382,13.0,0.105,1.587,40.0
E,-0.74,3.0,0.006802,12.3,0.151,1.862,62.0
F,1.19,-2.5,0.037552,5.2,0.29,2.228,115.5


In [52]:
def seq_to_props(seq, prop):
    """Translate a sequence into per-residue values of one property.

    Parameters
    ----------
    seq : iterable of str
        Residue codes; each must be a row label of the module-level
        ``aa_props`` frame.
    prop : str
        Column of ``aa_props`` to read.

    Returns
    -------
    list
        One value per residue, in sequence order.

    Raises
    ------
    KeyError
        If ``prop`` is not a column of ``aa_props`` or a residue is not in
        its index.
    """
    # Select the property column once instead of doing a full
    # (row, column) frame lookup for every residue.
    property_column = aa_props[prop]
    return [property_column.loc[residue] for residue in seq]

In [53]:
def generate_timeseries(row):
    """Compute autocovariance features for one protein pair.

    For each of the two sequences in ``row`` ("seq1", then "seq2") and for
    each property name in the module-level ``columns`` list, the sequence is
    converted to per-residue property values with ``seq_to_props`` and passed
    to ``acovf`` with ``nlag=lag``.

    Parameters
    ----------
    row : mapping-like
        Must provide "seq1" and "seq2" entries (a DataFrame row when called
        through ``df.apply(..., axis=1)``).

    Returns
    -------
    list
        Nested list indexed as ``[sequence][property]``; each leaf is the
        value returned by ``acovf`` for that sequence/property.

    Notes
    -----
    NOTE(review): ``acovf`` presumably needs sequences longer than ``lag``
    residues -- confirm the input data guarantees this.
    """
    # The original cell had a stray "]" in the append call, which made the
    # definition a SyntaxError.
    return [
        [acovf(seq_to_props(row[seq], prop), nlag=lag) for prop in columns]
        for seq in ("seq1", "seq2")
    ]

In [54]:
# Row-wise feature extraction: stores the nested [sequence][property] result
# of generate_timeseries in a new "autocovariances" column (mutates `df`).
df["autocovariances"] = df.apply(generate_timeseries,axis=1)



In [60]:
# Persist the frame including the new feature column, without the row index.
# NOTE(review): "autocovariances" holds nested lists, which CSV can only store
# as text -- reading this file back presumably yields strings, not arrays;
# confirm before relying on it as a cache.
df.to_csv("../data/autocov_df.csv", index=False)

In [55]:
# Preview the frame with the added "autocovariances" column.
df.head()

Unnamed: 0,interacts,seq1,seq2,autocovariances
0,1,MHKTASQRLFPGPSYQNIKSIMEDSTILSDWTNSNKQKMKYDFSCE...,MERRRITSAARRSYVSSGEMMVGGLAPGRRLGPGTRLSLARMPPPL...,"[[[0.9794680221719504, 0.016610216791042895, -..."
1,1,MPYNFCLPSLSCRTSCSSRPCVPPSCHGYTLPGACNIPANVSNCNW...,MSQAYSSSQRVSSYRRTFGGAPGFPLGSPLSSPVFPRAGFGSKGSS...,"[[[1.0351499871336143, -0.11668483852807333, 0..."
2,1,MSFSEMNRRTLAFRGGGLVTASGGGSTNNNAGGEASAWPPQPQPRQ...,MALCLKQVFAKDKTFRPRKRFEPGTQRFELYKKAQASLKSGLDLRS...,"[[[0.9310965985228851, 0.08598766826115835, -0..."
3,1,MKFQYKEDHPFEYRKKEGEKIRKKYPDRVPVIVEKAPKARVPDLDK...,MEPQVTLNVTFKNEIQSFLVSDPENTTWADIEAMVKVSFDLNTIQI...,"[[[1.1963975600847394, 0.13297429428956933, 0...."
4,1,MTILGTTFGMVFSLLQVVSGESGYAQNGDLEDAELDDYSFSCYSQL...,MRVAGAAKLVVAVAVFLLTFYVISQVFEIKMDASLGNLFARSALDT...,"[[[0.9142170428277822, 0.011203750327072465, 0..."


In [56]:
# Column names for the label and the single (nested) feature column.
target = "interacts"
features = "autocovariances"

# Pull the feature and label series out of the frame.
X, y = df[features], df[target]

# Hold out a test split (sklearn's default proportions) with a fixed seed
# so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [57]:
# Turn the per-row nested feature lists into one dense array per split
# (the next cell reports shape (85937, 2, 7, 31) for the training split).
X_train, X_test = np.stack(X_train), np.stack(X_test)

# Labels as plain NumPy arrays for Keras.
y_train, y_test = np.asarray(y_train), np.asarray(y_test)

In [69]:
# Sanity check on the stacked training array's dimensions.
X_train.shape

(85937, 2, 7, 31)

In [63]:
# Baseline dense classifier: flatten the (2, 7, 31) feature block, pass it
# through one hidden ReLU layer, and end in a single sigmoid unit for the
# binary `interacts` label.
model = Sequential([
    Flatten(input_shape=(2, 7, 31)),
    Dense(100, activation="relu"),
    Dense(1, activation="sigmoid"),
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 434)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               43500     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 43,601
Trainable params: 43,601
Non-trainable params: 0
_________________________________________________________________


In [64]:
# Binary cross-entropy matches the single sigmoid output; track accuracy.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])

In [65]:
# Train for 10 epochs in batches of 512, keeping the per-epoch metrics in
# `history`.
# NOTE(review): the test split doubles as validation data here, so no
# untouched hold-out set remains for a final evaluation.
history = model.fit(x=X_train, y=y_train, epochs=10, batch_size=512,
                    validation_data=(X_test,y_test)
                   )

Train on 85937 samples, validate on 28646 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [70]:
# # Instantiate a CNN.
# cnn_model = Sequential()

# # Add a convolutional layer.
# cnn_model.add(Conv2D(filters=16,             # number of filters
#                        kernel_size=(3,3),      # height/width of filter
#                        activation='relu',      # activation function 
#                        input_shape=(2, 7, 31))) # shape of input (image)

# # Add a pooling layer.
# cnn_model.add(MaxPooling2D(pool_size=(2,2))) # dimensions of region of pooling

# # Add another convolutional layer.
# cnn_model.add(Conv2D(64,
#                        kernel_size=(3,3),
#                        activation='relu'))

# # Add another pooling layer.
# cnn_model.add(MaxPooling2D(pool_size=(2,2)))

# # We have to remember to flatten to go from the "box" to the vertical line of nodes!
# cnn_model.add(Flatten())

# # Add a densely-connected layer with 64 neurons.
# cnn_model.add(Dense(64, activation='relu'))

# # Let's try to avoid overfitting!
# cnn_model.add(Dropout(0.5))

# # Add a densely-connected layer with 32 neurons.
# cnn_model.add(Dense(32, activation='relu'))

# # Let's try to avoid overfitting!
# cnn_model.add(Dropout(0.5))

# # Add a final output layer with 1 sigmoid neuron (binary classification).
# cnn_model.add(Dense(1, activation='sigmoid'))

# # Compile model
# cnn_model.compile(loss='binary_crossentropy',
#                     optimizer='adam',
#                     metrics=['accuracy'])