# Using LSTMs - UTRs of varying length

In this notebook, we implement an RNN model on the data, then evaluate it and analyze its output.

In [1]:
import ast

import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import  Dense, Embedding, LSTM, Bidirectional
from keras.layers import Convolution1D, MaxPooling1D
from keras.layers import Dropout
from keras.layers.core import Dense
from keras.layers.core import Dropout
from keras.layers.core import Activation
from keras.layers.core import Flatten
from keras.models import Sequential
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

## 1. We read the preprocessed data. 

In [2]:
# Read the preprocessed, gzip-compressed CSV into a DataFrame.
data_path = "Data/GSM4084997_varying_length_PREPROCESSED.csv.gz"
df = pd.read_csv(filepath_or_buffer=data_path, compression="gzip")

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,utr,set,0,1,2,3,4,5,6,...,r8,r9,r10,r11,r12,total,rl,len,one-hot encoding,scaled_rl
0,30,CCTTCAATGATTACCTCTATCCCCA,random,291,448,539,529,527,338,170,...,0.035287,0.040289,0.049426,0.059276,0.113321,0.001592,4.300281,25,"[[0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1], [0,...",-0.743234
1,3101,CTACGACGCTCCGACGTTCAACCCGCT,random,172,171,201,191,463,488,288,...,0.078099,0.071641,0.076364,0.072164,0.083046,0.001349,5.65686,27,"[[0, 1, 0, 0], [0, 0, 0, 1], [1, 0, 0, 0], [0,...",0.272356
2,21140,CCACATTCTCGGCCCCAATGCTCCACTC,random,337,392,560,447,417,309,109,...,0.036651,0.042373,0.054526,0.047254,0.122898,0.001448,4.150751,28,"[[0, 1, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0,...",-0.855179
3,60015,TACGTTTTGACCTTCGTTCATTTTG,random,210,177,181,200,421,478,297,...,0.077419,0.071114,0.076663,0.071114,0.071365,0.00134,5.645849,25,"[[0, 0, 0, 1], [1, 0, 0, 0], [0, 1, 0, 0], [0,...",0.264114
4,75065,TCCAAAAGTACATTCCATATTCTCCA,random,230,195,167,144,327,432,297,...,0.09371,0.068056,0.084204,0.084574,0.059441,0.001312,5.856299,26,"[[0, 0, 0, 1], [0, 1, 0, 0], [0, 1, 0, 0], [1,...",0.421665


In [None]:
# Splitting the human dataset into the training set and test set
# (80% train / 20% test, fixed seed so the split is repeatable).
# NOTE(review): `X` and `y_data` are only assigned in later cells (the CountVectorizer
# and `df['rl']` cells), so this cell raises NameError on a fresh top-to-bottom run.
# The same split is repeated after those cells — confirm this copy can be dropped.
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y_data, 
                                                    test_size = 0.20, 
                                                    random_state=42)

In [None]:
# Build, compile and train a Conv1D -> max-pool -> bidirectional-LSTM -> dense network.
#
# NOTE(review): this cell looks like a template that has not been adapted to this
# notebook's data yet — confirm each point before running it:
#   * `validmat` is not defined in any cell above, so `model.fit` raises NameError;
#   * input_shape=(1000, 4) does not match the max length of 100 reported further down;
#   * the 919-unit sigmoid head with binary cross-entropy does not obviously match
#     `y_data`, which is taken from the `rl` column.
forward_lstm = LSTM(units=320, return_sequences=True)
# backward_lstm = LSTM(input_dim=320, output_dim=320, return_sequences=True)
brnn = Bidirectional(forward_lstm)

# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print '...'` statement is a SyntaxError on Python 3.
print('building model')

model = Sequential()
# 320 filters of width 26 over a (positions, 4 channels) input; "valid" padding
# leaves 1000 - 26 + 1 = 975 positions.
model.add(Convolution1D(activation="relu",
                        input_shape=(1000, 4),
                        padding="valid", strides=1,
                        filters=320, kernel_size=26))

# Non-overlapping pooling windows of 13: 975 / 13 = 75 positions remain.
model.add(MaxPooling1D(strides=13, pool_size=13))

model.add(Dropout(0.2))

# The bidirectional wrapper returns the full sequence (return_sequences=True).
model.add(brnn)

model.add(Dropout(0.5))

model.add(Flatten())

# 75 pooled positions x 640 features (2 directions x 320 LSTM units) after flattening.
model.add(Dense(input_dim=75*640, units=925))
model.add(Activation('relu'))

model.add(Dense(input_dim=925, units=919))
model.add(Activation('sigmoid'))

print('compiling model')
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['cosine'])

print('running at most 60 epochs')

# Keep only the best weights seen so far, and stop once val_loss has not improved
# for 5 consecutive epochs.
checkpointer = ModelCheckpoint(filepath="DanQ_bestmodel.hdf5", verbose=1, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

model.fit(X_train, y_train, batch_size=2048, epochs=60,
          shuffle=True,
          validation_data=(np.transpose(validmat['validxdata'],axes=(0,2,1)), validmat['validdata']),
          callbacks=[checkpointer,earlystopper])


## 2. We pad the sequences to the length of the longest sequence.

In [12]:
# Largest value in the `len` column, i.e. the length of the longest UTR.
max_len = df.loc[:, "len"].max()
max_len

100

In [None]:
# TODO: pad the rest of the sequences to the max length (max_len); not implemented yet.

## 2. Get k-mers

In [4]:
def getKmers(sequence, size = 10):
    """Split a sequence into its overlapping, upper-cased k-mers.

    A window of ``size`` characters slides over ``sequence`` one position at a
    time, so a sequence of length ``n`` yields ``n - size + 1`` k-mers.

    Parameters
    ----------
    sequence : str
        The sequence to tokenise (e.g. a UTR string).
    size : int, default 10
        Length of each k-mer; must be at least 1.

    Returns
    -------
    list of str
        The k-mers in order of their start position. Empty when ``sequence``
        is shorter than ``size``.

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1.
    """
    # A non-positive window would silently produce empty or garbled slices.
    if size < 1:
        raise ValueError("size must be a positive integer, got %r" % (size,))
    last_start = len(sequence) - size
    return [sequence[start:start + size].upper() for start in range(last_start + 1)]

In [5]:
# Tokenise every UTR into overlapping k-mers, using getKmers' default size.
df['words'] = df['utr'].apply(getKmers)
df['words'].head()

0    [CCTTCAATGA, CTTCAATGAT, TTCAATGATT, TCAATGATT...
1    [CTACGACGCT, TACGACGCTC, ACGACGCTCC, CGACGCTCC...
2    [CCACATTCTC, CACATTCTCG, ACATTCTCGG, CATTCTCGG...
3    [TACGTTTTGA, ACGTTTTGAC, CGTTTTGACC, GTTTTGACC...
4    [TCCAAAAGTA, CCAAAAGTAC, CAAAAGTACA, AAAAGTACA...
Name: words, dtype: object

In [6]:
# One space-separated string of k-mers per UTR, ready to be fed to a text vectoriser.
df_text = [' '.join(kmers) for kmers in df['words']]
# Target values: the `rl` column as a NumPy array.
y_data = df['rl'].values

In [7]:
# Count n-grams of exactly three consecutive tokens in each k-mer string.
cv = CountVectorizer(
    ngram_range=(3, 3),  # alternative left in the original comment: (4, 4)
)
X = cv.fit_transform(df_text)

In [8]:
# Column labels for the count matrix. scikit-learn >= 1.0 provides
# get_feature_names_out(), and get_feature_names() was removed in 1.2, so prefer
# the new accessor and fall back to the old one on older installs.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# NOTE(review): toarray() materialises the whole count matrix densely, which can
# need a lot of memory for a large vocabulary — confirm this fits before re-running.
counts = pd.DataFrame(X.toarray(), columns=feature_names)

In [9]:
# Display the n-gram count table (one row per UTR, one column per n-gram).
counts

Unnamed: 0,aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa,aaaaaaaaaa aaaaaaaaaa aaaaaaaaac,aaaaaaaaaa aaaaaaaaaa aaaaaaaaag,aaaaaaaaaa aaaaaaaaaa aaaaaaaaat,aaaaaaaaaa aaaaaaaaac aaaaaaaaca,aaaaaaaaaa aaaaaaaaac aaaaaaaacc,aaaaaaaaaa aaaaaaaaac aaaaaaaacg,aaaaaaaaaa aaaaaaaaac aaaaaaaact,aaaaaaaaaa aaaaaaaaag aaaaaaaaga,aaaaaaaaaa aaaaaaaaag aaaaaaaagc,...,tttttttttt tttttttttc ttttttttcg,tttttttttt tttttttttc ttttttttct,tttttttttt tttttttttg ttttttttga,tttttttttt tttttttttg ttttttttgc,tttttttttt tttttttttg ttttttttgg,tttttttttt tttttttttg ttttttttgt,tttttttttt tttttttttt ttttttttta,tttttttttt tttttttttt tttttttttc,tttttttttt tttttttttt tttttttttg,tttttttttt tttttttttt tttttttttt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101721,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101722,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101724,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2. We split the data into training, validation and testing sets.

In [11]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_data,
    test_size=0.20,
    random_state=42,
)

## 3. We build our model and train it using our datasets.

In [None]:
# Vocabulary size: number of entries in tokenizer.word_index, plus one.
# NOTE(review): `tokenizer` is not defined in any cell above this one — confirm it is
# created before this cell runs (presumably a Keras Tokenizer; verify), otherwise this
# raises NameError.
vocab_size = len(tokenizer.word_index) + 1
