In [3]:
import tensorflow as tf
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import os
from ipywidgets import FileUpload

In [None]:
os.environ["CUDA_VISIBLE_DEVICES"]="-1"
config = tf.compat.v1.ConfigProto(device_count={'CPU':20}, intra_op_parallelism_threads=8,
      inter_op_parallelism_threads=40, log_device_placement=True)
sess = tf.compat.v1.Session(config=config)

In [4]:
f1 = open("../Basset/data/er.fa.lines", "r")
sequences = f1.read().split('\n')
sequences = list(filter(None, sequences))  # This removes empty sequences.

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# The LabelEncoder encodes a sequence of bases as a sequence of integers.
integer_encoder = LabelEncoder()  
# The OneHotEncoder converts an array of integers to a sparse matrix where 
# each row corresponds to one possible value of each feature.
one_hot_encoder = OneHotEncoder(categories='auto')   
input_features = []

for sequence in sequences:
    integer_encoded = integer_encoder.fit_transform(list(sequence))
    integer_encoded = np.array(integer_encoded).reshape(-1, 1)
    one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
    
    #filtering out the sequences with shorter pandas
    if (len(one_hot_encoded.toarray()[0])==4):
        input_features.append(one_hot_encoded.toarray())

np.set_printoptions(threshold=40)
print("Example sequence\n-----------------------")
print('DNA Sequence #1:\n',sequences[0][:10],'...',sequences[0][-10:])
print('One hot encoding of Sequence #1:\n',input_features[0].T)

input_features = np.stack(input_features)
np.save('er.fa',input_features)

Example sequence
-----------------------
DNA Sequence #1:
 TAGTAAAAAA ... ATATGCAAGA
One hot encoding of Sequence #1:
 [[0. 1. 0. ... 1. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 1. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


OSError: Not enough free space to write 38817504000 bytes

In [None]:
f2 = open("../Basset/data/er_act.lines", "r")
labels = f2.read().split('\n')
labels = list(filter(None, labels))  # removes empty sequences

#filtering out the sequences with shorter pandas
labels = labels[0:len(input_features)]

one_hot_encoder = OneHotEncoder(categories='auto')
labels = np.array(labels).reshape(-1, 1)
input_labels = one_hot_encoder.fit_transform(labels).toarray()
np.save('er_act',input_labels)

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
integer_encoder = LabelEncoder()  
one_hot_encoder = OneHotEncoder(categories='auto')   
input_features = []
input_features = np.load('sequences.npy')
input_labels = np.load('labels.npy')

In [None]:
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(
    input_features, input_labels, test_size=0.25, random_state=42)

In [None]:
from tensorflow.keras.layers import Conv1D, Dense, BatchNormalization, MaxPooling1D, Dropout, Flatten
from tensorflow.keras.models import Sequential

with sess:
    
    model = Sequential()
    model.add(Conv1D(filters=32, kernel_size=12, input_shape=(train_features.shape[1], 4)))
    model.add(MaxPooling1D(pool_size=4))
    model.add(Flatten())
    model.add(Dense(16, activation='relu'))
    model.add(Dense(2, activation='softmax')) 
    model.compile(loss='binary_crossentropy', optimizer='adam', 
                  metrics=['binary_accuracy'])
    
    history = model.fit(train_features, train_labels, batch_size=100,
                                        epochs=50, verbose=0, validation_split=0.25)

    plt.figure()
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'])
    plt.show()
   


In [None]:
from tensorflow.keras.layers import Conv2D, Dense, MaxPooling1D, BatchNormalization, Dropout, Flatten, Conv1D, Conv3D, Activation
from tensorflow.keras.models import Sequential

model = Sequential() 
model.add(Conv1D(filters=16, kernel_size= 8, input_shape=(train_features.shape[1], 4))) 
model.add(Activation('relu')) 
model.add(Dropout(0.4))
model.add(Conv1D(filters=16, kernel_size= 8))
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=4, strides= 2))
model.add(Dropout(0.4))
model.add(Flatten())
model.add(Dense(64))
model.add(Dropout(0.4))
model.add(Dense(2))
model.add(Activation('sigmoid')) 

In [None]:
model.predict(np.stack(test_features))

In [None]:
from ipywidgets import FileUpload
upload = FileUpload()
upload

In [None]:
[uploaded_file] = upload.value
seqbytes=str(upload.value.get(list(upload.value.keys())[0]).get('content')).replace('b','')

In [None]:
import re
seqarray = re.split("\\\\n", seqbytes)
seqarray.pop()

In [None]:
# The LabelEncoder encodes a sequence of bases as a sequence of integers.
integer_encoder = LabelEncoder()  
# The OneHotEncoder converts an array of integers to a sparse matrix where 
# each row corresponds to one possible value of each feature.
one_hot_encoder = OneHotEncoder(categories='auto')   
input_features = []

for sequence in seqarray:
  integer_encoded = integer_encoder.fit_transform(list(sequence))
  integer_encoded = np.array(integer_encoded).reshape(-1, 1)
  one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
  input_features.append(one_hot_encoded.toarray())

np.set_printoptions(threshold=40)
input_features = np.stack(input_features)
print("Example sequence\n-----------------------")
print('DNA Sequence #1:\n',sequences[0][:10],'...',sequences[0][-10:])
print('One hot encoding of Sequence #1:\n',input_features[0].T)