In [1]:
import tarfile
import pandas as pd
import numpy as np
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv1D, MaxPooling1D

from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

from matplotlib import pyplot as plt

Using TensorFlow backend.


### Read data

In [2]:
filename = "training_data.tar.gz"

# Unpack the training archive into the current working directory.
# The context manager closes the archive even if extraction fails part-way.
# NOTE(review): extractall() trusts member paths inside the archive; only use
# it on archives from a trusted source.
with tarfile.open(filename, "r:gz") as data:
    data.extractall()

In [3]:
# Read the raw lines of both FASTA files (headers and sequences are separated
# in the preprocessing cell). `with` guarantees the handles are closed even if
# reading raises.
with open('ghl_gold.fa', 'r') as b:
    bind = b.readlines()

with open('ghl_gold_random.fa', 'r') as u:
    unbind = u.readlines()

### Data preprocessing

In [4]:
def _clean_fasta_lines(lines):
    """Return the upper-cased sequence lines from raw FASTA lines.

    Lines containing '>' (FASTA headers) are dropped. Surrounding whitespace,
    including '\n' and a stray '\r' from CRLF files, is stripped so it never
    reaches the label encoder. Blank lines are skipped so they do not turn
    into empty sequences.
    """
    cleaned = []
    for line in lines:
        if '>' in line:
            continue
        sequence = line.strip()
        if sequence:
            cleaned.append(sequence.upper())
    return cleaned


bind = _clean_fasta_lines(bind)
unbind = _clean_fasta_lines(unbind)

In [5]:
import random

# Draw up to 100000 sequences from each class without replacement.
# A dedicated seeded generator keeps the draw reproducible across kernel
# restarts without touching the global `random` state.
# The size is clamped because random.sample raises ValueError when asked for
# more items than the population holds.
_sampler = random.Random(40)
test_bind = _sampler.sample(bind, min(100000, len(bind)))
test_unbind = _sampler.sample(unbind, min(100000, len(unbind)))

In [6]:
# Start from an empty frame with one column for the encoded sequence and one
# for its class label.
df = pd.DataFrame(dict(seq=[], label=[]))

In [7]:
# Integer-encode the nucleotide alphabet (four bases plus the ambiguity code N).
nucleotides = ['A', 'C', 'G', 'T', 'N']
LE = LabelEncoder()
LE.fit(nucleotides)

LabelEncoder()

In [None]:
from datetime import datetime

start = datetime.now()

# Encode every sequence as an integer array and collect the rows in a plain
# list first. Growing a DataFrame row by row with `df.append` copies the whole
# frame on each call (quadratic time), and `DataFrame.append` no longer exists
# in pandas >= 2.0.
records = [{'seq': LE.transform(list(seq)), 'label': 1} for seq in bind]
records += [{'seq': LE.transform(list(seq)), 'label': 0} for seq in unbind]

# Build the frame once: binding sequences (label 1) first, then the
# non-binding ones (label 0).
df = pd.DataFrame(records, columns=['seq', 'label'])

end = datetime.now()
print(end-start)

##### Split the dataset

In [None]:
from sklearn.utils import shuffle

# Shuffle the rows so the two classes are interleaved. The fixed random_state
# (same value as the train/test split) makes the order reproducible across
# kernel restarts.
new_df = shuffle(df, random_state=40)

In [None]:
# Features are the encoded sequences; targets are the class labels.
x, y = new_df['seq'], new_df['label']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
split_parts = train_test_split(x, y, test_size=0.2, random_state=40)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# One-hot encode the sequences and the labels.
# num_classes is passed explicitly: without it to_categorical infers the width
# from the largest code present in each array, so train and test could end up
# with different widths, or a width that does not match the model's input.
# NOTE: this cell overwrites x_train / x_test / y_train, so it must be run
# exactly once after the train/test split cell.
n_symbols = len(LE.classes_)  # size of the nucleotide alphabet fitted on LE
n_labels = 2                  # label 1 = binding file, label 0 = random file

x_train = to_categorical(x_train.values.tolist(), num_classes=n_symbols)
x_test = to_categorical(x_test.values.tolist(), num_classes=n_symbols)

y_train = to_categorical(y_train.values.tolist(), num_classes=n_labels)
# y_test is kept as a Series for the sklearn metrics; y_t is its one-hot form.
y_t = to_categorical(y_test.values.tolist(), num_classes=n_labels)

### CNN model

In [None]:
# 1D CNN: one convolution + max-pooling stage, then a dense layer with
# dropout, ending in a 2-way softmax over the two classes.
model = Sequential([
    Conv1D(
        filters=64,
        kernel_size=3,
        strides=1,
        padding='valid',
        input_shape=(20,5),
        activation='relu',
    ),
    MaxPooling1D(pool_size=3, strides=1, padding='valid'),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(2, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Display the layer summary of the compiled model.
model.summary()

In [None]:
# Train for 200 epochs, with 15% of the training data used for validation.
# The returned history feeds the loss-curve plot.
history = model.fit(
    x_train,
    y_train,
    epochs=200,
    validation_split=0.15,
)

##### Evaluation

Accuracy

In [None]:
# evaluate() returns [loss, accuracy] because the model was compiled with
# metrics=['accuracy']; report the accuracy rounded to two decimals.
score = model.evaluate(x_test, y_t, verbose=1)
test_accuracy = round(score[1], 2)
print("score = " + str(test_accuracy))

Loss-epoch curve

In [None]:
# Plot training and validation loss per epoch on the same axes.
for series_name in ('loss', 'val_loss'):
    plt.plot(history.history[series_name])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['train', 'val'], loc='upper right')
plt.show()

Precision-recall curve

In [None]:
# Predict on the test set and keep column 1 of the softmax output
# (the score for label 1).
class_probs = model.predict(x_test, verbose=0)
probs = class_probs[:, 1]

In [None]:
# Compute precision/recall over all thresholds from the true labels and the
# label-1 scores.
pr_curve = precision_recall_curve(y_test.values, probs)
precision, recall, thresholds = pr_curve

In [None]:
# Precision-recall curve: recall on the x-axis, precision on the y-axis.
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

ROC curve & AUC

In [None]:
# ROC curve and AUC computed from the true labels and the label-1 scores.
fpr, tpr, _ = roc_curve(y_test.values, probs)
auc = roc_auc_score(y_test.values, probs)
auc_text = str(round(auc,2))

plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (AUC = ' + auc_text + ')')
plt.show()