Load the packages used in this notebook

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from keras.models import Sequential
from keras.layers import Dense, InputLayer, Input, Dropout, LSTM, Activation, Bidirectional, GRU
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.model_selection import KFold

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import metrics

Using TensorFlow backend.


Load dna2vec

In [2]:
from dna2vec.multi_k_model import MultiKModel

# Relative path to the pretrained dna2vec embedding file.
# NOTE(review): the file name suggests k-mers of length 3-8 and 100-d vectors — confirm.
filepath = 'dna2vec-20161219-0153-k3to8-100d-10c-29320Mbp-sliding-Xat.w2v'
# Embedding model; later cells look up k-mers through mk_model.vector(kmer).
mk_model = MultiKModel(filepath)

Load and split data

In [3]:
# Load the labelled data set; later cells rely on its 'label' and 'id' columns.
df = pd.read_csv("train.csv")

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
train_data,test_data = train_test_split(df,test_size=0.2,random_state=42)

In [5]:
# Features: every column except 'label' and 'id', as a NumPy array because the
# k-mer code indexes it as X[i, 0]. (The former mid-cell `.head()` call was
# removed: only a cell's last expression is displayed, so it did nothing.)
X_train = train_data.drop(['label', 'id'], axis=1).values

# Target kept as a one-column DataFrame; its preview is this cell's output.
y_train = train_data[['label']]
y_train.head()

Unnamed: 0,label
968,1
240,1
819,1
692,1
420,1


In [6]:
# Features: every column except 'label' and 'id', as a NumPy array because the
# k-mer code indexes it as X[i, 0]. (The former mid-cell `.head()` call was
# removed: only a cell's last expression is displayed, so it did nothing.)
X_test = test_data.drop(['label', 'id'], axis=1).values

# Target kept as a one-column DataFrame; its preview is this cell's output.
y_test = test_data[['label']]
y_test.head()

Unnamed: 0,label
1860,0
353,1
1333,0
905,1
1289,0


Convert sequences to a k-mer representation

In [7]:
def getKmers(sequence, k=5, stride=2):
    """Split ``sequence`` into windows of length ``k`` taken every ``stride`` items.

    Window starts are 0, stride, 2*stride, ... for every start below
    ``len(sequence) - k + stride``. The last window is therefore shorter than
    ``k`` whenever ``len(sequence) - k`` is not a multiple of ``stride``.

    Args:
        sequence: Sliceable sequence, e.g. a DNA string.
        k: Window (k-mer) length.
        stride: Step between consecutive window starts; ``range`` raises
            ``ValueError`` if it is 0.

    Returns:
        List of slices of ``sequence`` in left-to-right order.
    """
    # Step by ``stride``: the step used to be hard-coded to 2, so any other
    # stride value was silently ignored.
    return [sequence[x:x+k] for x in range(0, len(sequence) - k + stride, stride)]

getKmers("GCGGGGCGAGCCTC")

['GCGGG', 'GGGGC', 'GGCGA', 'CGAGC', 'AGCCT', 'CCTC']

In [14]:
# k-mer length and step between window starts (same values as getKmers' defaults).
k = 5
stride = 2
# Sequence length, read from the first training example.
# NOTE(review): assumes every sequence has this same length — confirm.
L = len(X_train[0,0])
# Number of windows getKmers yields for a length-L sequence with these k and
# stride values (a trailing shorter window is counted).
N = int(np.ceil((L - k) / stride + 1))
# Length of each embedding vector stored per k-mer.
# NOTE(review): presumably must equal the dna2vec vector size — confirm.
vec_dim = 100

In [None]:
def Sequences2Vec(X):
    """Embed each sequence in ``X`` as a matrix of k-mer vectors.

    Reads the notebook-level ``k``, ``stride``, ``N``, ``vec_dim`` and
    ``mk_model``.

    Args:
        X: 2-D array whose first column holds the sequences (read as ``X[i, 0]``).

    Returns:
        Array of shape ``(X.shape[0], N, vec_dim)``; row ``j`` of sample ``i``
        is the vector of that sample's ``j``-th k-mer. Rows without a
        corresponding k-mer stay zero.
    """
    m = X.shape[0]
    X_vec = np.zeros((m, N, vec_dim))
    for i in range(m):
        # Split with the same k/stride that N was derived from, rather than
        # getKmers' defaults, so the window count cannot drift away from N.
        for j, word in enumerate(getKmers(X[i, 0], k=k, stride=stride)):
            X_vec[i, j] = mk_model.vector(word)
    return X_vec

In [8]:
from sklearn.pipeline import TransformerMixin
from sklearn.base import BaseEstimator

class SequencesToVecs(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that embeds sequences as k-mer vector matrices.

    Stateless: ``fit`` learns nothing. ``transform`` reads the notebook-level
    ``k``, ``stride``, ``N``, ``vec_dim`` and ``mk_model``.
    """
    def __init__(self,  **kwargs):
        # NOTE(review): kwargs are forwarded unchanged to the base initializer;
        # confirm whether any keyword is actually accepted there.
        super().__init__(**kwargs)

    def fit(self, texts, y=None):
        """Do nothing and return ``self`` so the step can be used in a pipeline."""
        return self

    def transform(self, X, y=None):
        """Return the k-mer embedding of every sequence in ``X``.

        Args:
            X: 2-D array whose first column holds the sequences (``X[i, 0]``).
            y: Ignored.

        Returns:
            Array of shape ``(X.shape[0], N, vec_dim)``; rows without a
            corresponding k-mer stay zero.
        """
        m = X.shape[0]
        X_vec = np.zeros((m, N, vec_dim))
        for i in range(m):
            # Split with the same k/stride that N was derived from, rather
            # than getKmers' defaults, so the window count cannot drift from N.
            for j, word in enumerate(getKmers(X[i, 0], k=k, stride=stride)):
                X_vec[i, j] = mk_model.vector(word)
        return X_vec

# Single shared instance reused as the first step of every pipeline below.
sequencer = SequencesToVecs()

In [None]:
# Embed the training set directly with the function form of the transform.
# NOTE(review): X_vec_train is not referenced again in the visible cells; the
# pipelines call the SequencesToVecs transformer instead.
X_vec_train = Sequences2Vec(X_train)

Create an RNN model

In [9]:
def tfbs_LSTM(input_shape=[6,100]):
    """Build and compile a three-layer stacked LSTM binary classifier.

    Args:
        input_shape: Shape of one input sample, given to the first LSTM layer.

    Returns:
        Compiled ``Sequential`` model ending in a single sigmoid unit.
    """
    # The first two LSTMs emit the full sequence so the next LSTM gets one
    # vector per time step; the third emits only its final output.
    net = Sequential([
        LSTM(32, return_sequences=True, input_shape=input_shape),
        LSTM(32, return_sequences=True),
        LSTM(32),
        Dense(1, activation='sigmoid'),
    ])
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

In [10]:
# Wrap the model builder as a scikit-learn classifier; the training settings
# are stored on the wrapper and applied when it is fitted.
tfbs_lstm = KerasClassifier(
    build_fn=tfbs_LSTM,
    epochs=40,
    batch_size=32,
    verbose=0,
)

In [15]:
# Build the Scikit-learn pipeline: k-mer embedding step followed by the stacked LSTM classifier
pipeline = make_pipeline(sequencer, tfbs_lstm)

In [16]:
# Fit the whole pipeline on the raw training sequences and their labels.
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('sequencestovecs', SequencesToVecs()), ('kerasclassifier', <keras.wrappers.scikit_learn.KerasClassifier object at 0x7f4db434ef98>)])

In [17]:
print('Computing predictions on test set...')
y_preds = pipeline.predict(X_test)

# accuracy_score expects (y_true, y_pred) in that order.
print('Test accuracy: {:.2f} %'.format(100*metrics.accuracy_score(y_test, y_preds)))

Computing predictions on test set...
Test accuracy: 87.00 %


In [None]:
# ROC AUC ranks samples by a continuous score, so use the predicted
# probability of the positive class (column 1) rather than the hard labels
# in y_preds.
metrics.roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])

Another model: stacked LSTM with dropout

In [24]:
def tfbs_LSTM_Dropout(input_shape=[6,100]):
    """Build and compile a two-layer LSTM binary classifier with dropout.

    Args:
        input_shape: Shape of one input sample, given to the input layer.

    Returns:
        Compiled ``Sequential`` model ending in a single sigmoid unit.
    """
    # Each LSTM is followed by 50% dropout; only the first LSTM returns the
    # full sequence, which the second LSTM consumes.
    net = Sequential([
        InputLayer(input_shape),
        LSTM(128, return_sequences=True),
        Dropout(0.5),
        LSTM(128),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ])
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

In [25]:
# Wrap the dropout-LSTM builder as a scikit-learn classifier.
tfbs_lstm_dropout = KerasClassifier(
    build_fn=tfbs_LSTM_Dropout,
    epochs=40,
    batch_size=32,
    verbose=0,
)

In [26]:
# Build the Scikit-learn pipeline: k-mer embedding step followed by the dropout LSTM classifier
pipeline = make_pipeline(sequencer, tfbs_lstm_dropout)
# Fit model on the training split (`pipeline` now refers to this new pipeline)
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('sequencestovecs', SequencesToVecs()), ('kerasclassifier', <keras.wrappers.scikit_learn.KerasClassifier object at 0x7f4d95f515c0>)])

In [27]:
print('Computing predictions on test set...')
y_preds = pipeline.predict(X_test)

# accuracy_score expects (y_true, y_pred) in that order.
print('Test accuracy: {:.2f} %'.format(100*metrics.accuracy_score(y_test, y_preds)))

Computing predictions on test set...
Test accuracy: 87.25 %


In [28]:
# NOTE(review): y_preds holds the hard class labels returned by
# pipeline.predict, so this scores thresholded predictions. Passing
# pipeline.predict_proba(X_test)[:, 1] instead would give the usual
# probability-based ROC AUC; the recorded output would then need a re-run.
metrics.roc_auc_score(y_test, y_preds)

0.8726593164829121

Bidirectional model

In [29]:
def tfbs_BiLSTM(input_shape=[6,100]):
    """Build and compile a two-layer bidirectional LSTM binary classifier.

    Args:
        input_shape: Shape of one input sample, given to the input layer.

    Returns:
        Compiled ``Sequential`` model whose single output passes through a
        sigmoid activation.
    """
    # Each bidirectional LSTM is followed by 50% dropout; only the first one
    # returns the full sequence, which the second one consumes.
    net = Sequential([
        InputLayer(input_shape),
        Bidirectional(LSTM(128, return_sequences=True)),
        Dropout(0.5),
        Bidirectional(LSTM(128)),
        Dropout(0.5),
        Dense(1),
        Activation('sigmoid'),
    ])
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

In [30]:
# Wrap the bidirectional-LSTM builder as a scikit-learn classifier.
tfbs_bilstm = KerasClassifier(
    build_fn=tfbs_BiLSTM,
    epochs=40,
    batch_size=32,
    verbose=0,
)

In [31]:
# Build the Scikit-learn pipeline: k-mer embedding step followed by the bidirectional LSTM classifier
pipeline = make_pipeline(sequencer, tfbs_bilstm)
# Fit model on the training split (`pipeline` now refers to this new pipeline)
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('sequencestovecs', SequencesToVecs()), ('kerasclassifier', <keras.wrappers.scikit_learn.KerasClassifier object at 0x7f4d569d8748>)])

In [32]:
print('Computing predictions on test set...')
y_preds = pipeline.predict(X_test)

# accuracy_score expects (y_true, y_pred) in that order.
print('Test accuracy: {:.2f} %'.format(100*metrics.accuracy_score(y_test, y_preds)))

Computing predictions on test set...
Test accuracy: 87.50 %


In [None]:
# ROC AUC ranks samples by a continuous score, so use the predicted
# probability of the positive class (column 1) rather than the hard labels
# in y_preds.
metrics.roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])

Bidirectional GRUs

In [33]:
def tfbs_BiGRU(input_shape=[6,100]):
    """Build and compile a two-layer bidirectional GRU binary classifier.

    Args:
        input_shape: Shape of one input sample, given to the input layer.

    Returns:
        Compiled ``Sequential`` model whose single output passes through a
        sigmoid activation.
    """
    # Each bidirectional GRU is followed by 50% dropout; only the first one
    # returns the full sequence, which the second one consumes.
    net = Sequential([
        InputLayer(input_shape),
        Bidirectional(GRU(128, return_sequences=True)),
        Dropout(0.5),
        Bidirectional(GRU(128)),
        Dropout(0.5),
        Dense(1),
        Activation('sigmoid'),
    ])
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

In [34]:
# Wrap the bidirectional-GRU builder as a scikit-learn classifier.
tfbs_bigru = KerasClassifier(
    build_fn=tfbs_BiGRU,
    epochs=40,
    batch_size=32,
    verbose=0,
)

In [35]:
# Build the Scikit-learn pipeline: k-mer embedding step followed by the bidirectional GRU classifier
pipeline = make_pipeline(sequencer, tfbs_bigru)
# Fit model on the training split (`pipeline` now refers to this new pipeline)
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('sequencestovecs', SequencesToVecs()), ('kerasclassifier', <keras.wrappers.scikit_learn.KerasClassifier object at 0x7f4d569d8470>)])

In [36]:
print('Computing predictions on test set...')
y_preds = pipeline.predict(X_test)

# accuracy_score expects (y_true, y_pred) in that order.
print('Test accuracy: {:.2f} %'.format(100*metrics.accuracy_score(y_test, y_preds)))

Computing predictions on test set...
Test accuracy: 88.25 %


In [39]:
# NOTE(review): the recorded output of this cell logs 20 epochs ("Epoch 1/20"),
# but the code requests epochs=40 — the output is stale or the value was edited
# after the run. Re-run the cell or confirm which setting is intended.
tfbs_bigru = KerasClassifier(tfbs_BiGRU, epochs=40, batch_size=32, verbose=1)
# Build the Scikit-learn pipeline
pipeline = make_pipeline(sequencer, tfbs_bigru)
# Fit model
pipeline.fit(X_train, y_train)

print('Computing predictions on test set...')
y_preds = pipeline.predict(X_test)

# accuracy_score expects (y_true, y_pred) in that order.
print('Test accuracy: {:.2f} %'.format(100*metrics.accuracy_score(y_test, y_preds)))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Computing predictions on test set...
Test accuracy: 85.25 %


In [41]:
# Same bidirectional GRU, trained for 50 epochs with a batch size of 200.
tfbs_bigru = KerasClassifier(tfbs_BiGRU, epochs=50, batch_size=200, verbose=1)
# Build the Scikit-learn pipeline
pipeline = make_pipeline(sequencer, tfbs_bigru)
# Fit model
pipeline.fit(X_train, y_train)

print('Computing predictions on test set...')
y_preds = pipeline.predict(X_test)

# accuracy_score expects (y_true, y_pred) in that order.
print('Test accuracy: {:.2f} %'.format(100*metrics.accuracy_score(y_test, y_preds)))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Computing predictions on test set...
Test accuracy: 87.75 %


In [None]:
# Inspect the first ten test samples.
# predict_proba returns one column per class; keep only column 1 so the
# printed values really are the positive-class probabilities the label names.
print('Probability(positive) ='+ str(pipeline.predict_proba(X_test[0:10])[:, 1]))
print('True class: ' + str(y_test[0:10]))