In [2]:
# Workaround to import from dir above. In this case from "../src"
import sys
sys.path.append("..")

In [3]:
# Autoreload modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Imports

In [113]:
import pandas as pd
import numpy as np
# Fixed seed so NumPy-driven randomness repeats across "Restart & Run All".
# Calling np.random.seed() with no argument re-seeds from fresh entropy, which
# makes every run different.
SEED = 42
np.random.seed(SEED)

from sklearn.preprocessing import LabelBinarizer, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split

import src.utils.text_utils as txt_utils
import src.utils.func_utils as f_utils
import src.utils.graph_utils as g_utils
from src.utils.func_utils import timer
import model.train as train_utils

import tensorflow as tf
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Flatten, Dropout, Activation, GRU, LSTM, TimeDistributed
from keras.layers.normalization import BatchNormalization
from keras.preprocessing.text import Tokenizer
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger, Callback
from keras.wrappers.scikit_learn import KerasClassifier

%matplotlib inline

# Global Variables

In [53]:
# Folder (relative to this notebook) that holds the preprocessed CSV files.
data_path = "../preprocessed_data/"

# 1-based dataset position -> dataset name.
dict_data = dict(enumerate(['746', '1625', 'impens', 'schilling'], start=1))

# WORDS  : passed as Tokenizer(num_words=...) and as the Embedding input size.
# LENGTH : padded sequence length (pad_sequences maxlen / Embedding input_length).
# DEPTH  : not referenced by the cells visible in this notebook section.
WORDS, LENGTH, DEPTH = 21, 8, 8

# Arguments forwarded to model.fit().
epochs, batch_size = 1000, 64

# Cross-validation settings and accumulators; not used by the cells visible
# here -- presumably consumed by a later section (TODO confirm).
folds, runs = 4, 2
cv_LL = cv_AUC = cv_gini = 0
fpred, avpred, avreal, avids = [], [], [], []

# Read by init_callbacks() as the EarlyStopping patience.
patience = 10
# Same value as batch_size; kept as a separate name for any cell that uses it.
batchsize = 64

# Load Data

## 746 Dataset

In [6]:
# Load the preprocessed 746 dataset (';'-separated CSV located under data_path).
df_746 = pd.read_csv(
    filepath_or_buffer=data_path + "data_746_preprocessed.csv",
    sep=';',
)
# Cell output: preview of the first two rows.
df_746.head(n=2)

Unnamed: 0,octamer,label,0,1,2,3,4,5,6,7,...,150,151,152,153,154,155,156,157,158,159
0,AAAMKRHG,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AAAMSSAI,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 1625 Dataset

In [7]:
# Load the preprocessed 1625 dataset (';'-separated CSV located under data_path).
df_1625 = pd.read_csv(
    filepath_or_buffer=data_path + "data_1625_preprocessed.csv",
    sep=';',
)
# Cell output: preview of the first two rows.
df_1625.head(n=2)

Unnamed: 0,octamer,label,0,1,2,3,4,5,6,7,...,150,151,152,153,154,155,156,157,158,159
0,AECFRIFD,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HLVEALYL,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Impens Dataset

In [8]:
# Load the preprocessed Impens dataset (';'-separated CSV located under data_path).
df_impens = pd.read_csv(
    filepath_or_buffer=data_path + "data_impens_preprocessed.csv",
    sep=';',
)
# Cell output: preview of the first two rows.
df_impens.head(n=2)

Unnamed: 0,octamer,label,0,1,2,3,4,5,6,7,...,150,151,152,153,154,155,156,157,158,159
0,AAAVDAGM,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,AAGKSGGG,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Schilling Dataset

In [9]:
# Load the preprocessed Schilling dataset (';'-separated CSV located under data_path).
df_schilling = pd.read_csv(
    filepath_or_buffer=data_path + "data_schilling_preprocessed.csv",
    sep=';',
)
# Cell output: preview of the first two rows.
df_schilling.head(n=2)

Unnamed: 0,octamer,label,0,1,2,3,4,5,6,7,...,150,151,152,153,154,155,156,157,158,159
0,AAAAPAKV,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,AAAELGAR,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


# Modeling

## Preprocessing for Embedding

### 746 Dataset

In [10]:
# Positional split: every column from position 2 onward becomes the model
# input, the column at position 1 becomes the target.
# NOTE(review): this relies on the CSV column order. The head() preview of this
# frame lists 'Unnamed: 0', 'octamer', 'label' first -- confirm that position 1
# really is the label column and that positions 2+ hold only features.
x_train_746 = df_746.iloc[:, 2:]
y_train_746 = df_746.iloc[:, 1]

### 1625 Dataset

In [11]:
# Positional split: columns from position 2 onward are the inputs, the column
# at position 1 is the target.
# NOTE(review): depends on the CSV column order -- confirm position 1 is the
# label column for this frame.
x_train_1625 = df_1625.iloc[:, 2:]
y_train_1625 = df_1625.iloc[:, 1]

### Impens Dataset

In [12]:
# Positional split: columns from position 2 onward are the inputs, the column
# at position 1 is the target.
# NOTE(review): depends on the CSV column order -- confirm position 1 is the
# label column for this frame.
x_train_impens = df_impens.iloc[:, 2:]
y_train_impens = df_impens.iloc[:, 1]

### Schilling Dataset

In [13]:
# Positional split: columns from position 2 onward are the inputs, the column
# at position 1 is the target.
# NOTE(review): depends on the CSV column order -- confirm position 1 is the
# label column for this frame.
x_train_schilling = df_schilling.iloc[:, 2:]
y_train_schilling = df_schilling.iloc[:, 1]

### All Data - 6586

In [14]:
# Stack the per-dataset inputs and targets row-wise (746, 1625, Impens,
# Schilling, in that order) and renumber the rows from zero.
x_train_full = pd.concat(
    objs=[x_train_746, x_train_1625, x_train_impens, x_train_schilling],
    axis=0,
    ignore_index=True,
)
y_train_full = pd.concat(
    objs=[y_train_746, y_train_1625, y_train_impens, y_train_schilling],
    axis=0,
    ignore_index=True,
)

In [15]:
# Replace the stacked inputs with the first element returned by
# txt_utils.scale_data; the second returned element is discarded.
# NOTE(review): the cell overwrites its own input, so re-running it without
# first re-running the concat cell presumably scales already-scaled data --
# confirm against scale_data's implementation.
x_train_full, _ = txt_utils.scale_data(x_train_full)

## Keras - Embedding + RNN (LSTM)

In [16]:
# The four loaded frames, listed in the same order as keys 1-4 of dict_data.
dfs = [df_746, df_1625, df_impens, df_schilling]

In [17]:
# Add two columns to each loaded frame (in place): the octamer with its
# characters separated by single spaces, and the dataset name from dict_data.
for i, df in enumerate(dfs):
    dataset_name = dict_data[i+1]
    spaced_octamers = df['octamer'].apply(' '.join)
    df['octamer_string'] = spaced_octamers
    df['dataset'] = dataset_name

# Stack the annotated frames; no ignore_index, so each frame keeps its own row labels.
df_full = pd.concat(
    [df_746, df_1625, df_impens, df_schilling],
    axis=0,
)

In [18]:
# Fit one shared tokenizer on the spaced octamers of all datasets, then encode
# them as integer sequences and pad/truncate each sequence to LENGTH entries.
# The fitted `tokenizer` is reused by the per-dataset encoding cell, so it must
# be fitted before that cell runs.
octamers_full = df_full['octamer_string']
tokenizer = Tokenizer(num_words = WORDS)
tokenizer.fit_on_texts(octamers_full)
sequences_full = tokenizer.texts_to_sequences(octamers_full)
x_train_full_vec = pad_sequences(sequences_full, maxlen = LENGTH)

In [20]:
def _encode_dataset(dataset_name):
    """Return (spaced octamers, token sequences, padded matrix) for one dataset.

    Rows are selected from df_full by their 'dataset' value, encoded with the
    already-fitted `tokenizer`, and padded to LENGTH entries.
    """
    texts = df_full[df_full['dataset'] == dataset_name]['octamer_string']
    token_ids = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(token_ids, maxlen = LENGTH)
    return texts, token_ids, padded


octamers_746, sequences_746, octamers_746_vec = _encode_dataset('746')
octamers_1625, sequences_1625, octamers_1625_vec = _encode_dataset('1625')
octamers_impens, sequences_impens, octamers_impens_vec = _encode_dataset('impens')
octamers_schilling, sequences_schilling, octamers_schilling_vec = _encode_dataset('schilling')


In [123]:
def auc(y_true, y_pred):
    """Keras metric function built on ``tf.metrics.auc``.

    Creates the TensorFlow AUC ops for the given label/prediction tensors,
    runs the local-variables initializer in the current Keras session, and
    returns element ``[1]`` of what ``tf.metrics.auc`` produced.

    NOTE(review): per the TF1 documentation element [1] is the update op of a
    streaming metric, so the reported value presumably accumulates over every
    batch seen so far rather than being computed per epoch -- confirm before
    relying on ``val_auc`` for early stopping.
    """
    metric_ops = tf.metrics.auc(y_true, y_pred)
    session = K.get_session()
    session.run(tf.local_variables_initializer())
    return metric_ops[1]


def baseline_model():
    """Build and compile the Embedding + LSTM binary classifier.

    Layer stack, in order: Embedding (input size WORDS, 8-dimensional vectors,
    LENGTH tokens per sample), LSTM with 8 units that returns the full
    sequence, Flatten, Dropout(0.5), and one sigmoid output unit.

    Returns
    -------
    Sequential
        Model compiled with the rmsprop optimizer, binary cross-entropy loss,
        and the metrics ``auc`` (defined in this notebook) and accuracy.
    """
    layer_stack = [
        Embedding(WORDS, 8, input_length=LENGTH),
        LSTM(8, dropout=0.1, recurrent_dropout=0.5, return_sequences=True),
        Flatten(),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        loss='binary_crossentropy',
        optimizer='rmsprop',
        metrics=[auc, 'accuracy'],
    )
    return network

def init_callbacks(model):
    """Build the Keras callbacks used for one training run.

    Parameters
    ----------
    model : str
        Run label (the notebook passes the dataset name, e.g. ``'746'``). It is
        only interpolated into the log and checkpoint paths; it is not a Keras
        model object.

    Returns
    -------
    list
        ``[EarlyStopping, CSVLogger, ModelCheckpoint]``. EarlyStopping and
        ModelCheckpoint both monitor ``val_auc`` with ``mode='max'``; early
        stopping uses the module-level ``patience`` and restores the best
        weights.

    Notes
    -----
    ``logs/`` and ``checkpoints/<model>/`` are created if missing. Without
    that, a fresh checkout fails as soon as the logger or the checkpoint
    callback tries to write its file.
    """
    import os  # local import so this cell does not depend on the import cell

    logger = 'logs/keras-{}.log'.format(model)
    checkpoint = 'checkpoints/{}/{}_weights.h5'.format(model, model)

    # Make sure both output directories exist before Keras writes into them.
    for target in (logger, checkpoint):
        os.makedirs(os.path.dirname(target), exist_ok=True)

    callbacks = [
        EarlyStopping(monitor='val_auc', patience=patience, mode='max',
                      restore_best_weights = True, min_delta=0.001, verbose=1),
        CSVLogger(logger, separator=',', append=False),
        ModelCheckpoint(
            checkpoint,
            monitor='val_auc', mode='max',
            save_best_only=True,
            verbose=1
        )
    ]
    return callbacks

def predict_tests(model, dataset_pos, xs, ys):
    """Print the ROC AUC of ``model`` on every dataset except its training one.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(x)``; its output is passed to
        ``roc_auc_score`` as the score vector.
    dataset_pos : int
        1-based position of the dataset the model was trained on. That entry
        of ``xs``/``ys`` is skipped. Positions are also the keys of the
        module-level ``dict_data``, which supplies the printed names.
    xs, ys : sequence
        Inputs and true labels, one entry per dataset, in ``dict_data`` order.
        Neither sequence is modified.

    Raises
    ------
    ValueError
        If ``xs`` and ``ys`` have different lengths.
    IndexError
        If ``dataset_pos`` is outside ``1..len(xs)``. Previously a value of 0
        or below silently wrapped around and held out the wrong dataset.
    """
    n_datasets = len(xs)
    if len(ys) != n_datasets:
        raise ValueError(
            "xs and ys must have the same number of datasets "
            "(got {} and {})".format(n_datasets, len(ys))
        )
    if not 1 <= dataset_pos <= n_datasets:
        raise IndexError(
            "dataset_pos must be between 1 and {} (got {})".format(n_datasets, dataset_pos)
        )

    # Positions come from the data itself, so any number of datasets works.
    for pos, (x, y) in enumerate(zip(xs, ys), start=1):
        if pos == dataset_pos:
            continue  # skip the dataset the model was trained on
        y_pred = model.predict(x)
        print(dict_data[pos] + "\nAUC: " + str(roc_auc_score(y, y_pred)) + "\n")
        
# Padded inputs and matching labels, one entry per dataset, ordered like the
# keys 1-4 of dict_data (746, 1625, impens, schilling).
xs = [
    octamers_746_vec,
    octamers_1625_vec,
    octamers_impens_vec,
    octamers_schilling_vec,
]
ys = [
    y_train_746,
    y_train_1625,
    y_train_impens,
    y_train_schilling,
]

### 746

#### Train

In [124]:
# Stratified 80/20 hold-out split of the 746 dataset with a fixed random_state.
x_train, x_val, y_train, y_val = train_test_split(
    octamers_746_vec,
    y_train_746,
    test_size=0.2,
    random_state=1,
    stratify=y_train_746,
)
model_746 = baseline_model()
# Trailing expression: the History object returned by fit() is the cell output.
model_746.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=init_callbacks('746'),
    verbose=1,
)

Train on 596 samples, validate on 149 samples
Epoch 1/1000

Epoch 00001: val_auc improved from -inf to 0.62848, saving model to checkpoints/746/746_weights.h5
Epoch 2/1000

Epoch 00002: val_auc improved from 0.62848 to 0.68536, saving model to checkpoints/746/746_weights.h5
Epoch 3/1000

Epoch 00003: val_auc improved from 0.68536 to 0.70201, saving model to checkpoints/746/746_weights.h5
Epoch 4/1000

Epoch 00004: val_auc improved from 0.70201 to 0.72485, saving model to checkpoints/746/746_weights.h5
Epoch 5/1000

Epoch 00005: val_auc improved from 0.72485 to 0.73902, saving model to checkpoints/746/746_weights.h5
Epoch 6/1000

Epoch 00006: val_auc improved from 0.73902 to 0.75002, saving model to checkpoints/746/746_weights.h5
Epoch 7/1000

Epoch 00007: val_auc improved from 0.75002 to 0.76454, saving model to checkpoints/746/746_weights.h5
Epoch 8/1000

Epoch 00008: val_auc improved from 0.76454 to 0.77543, saving model to checkpoints/746/746_weights.h5
Epoch 9/1000

Epoch 00009: va

<keras.callbacks.History at 0x7f2b84d84908>

#### Test

In [125]:
# Score the 746-trained model on the other three datasets
# (position 1 is '746' in dict_data, so that entry of xs/ys is skipped).
predict_tests(model_746, 1, xs, ys)

1625
AUC: 0.9122631016042779

impens
AUC: 0.8137310215320875

schilling
AUC: 0.825277480430584



### 1625

#### Train

In [126]:
# Stratified 80/20 hold-out split of the 1625 dataset with a fixed random_state.
x_train, x_val, y_train, y_val = train_test_split(
    octamers_1625_vec,
    y_train_1625,
    test_size=0.2,
    random_state=1,
    stratify=y_train_1625,
)
model_1625 = baseline_model()
# Trailing expression: the History object returned by fit() is the cell output.
model_1625.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=init_callbacks('1625'),
    verbose=1,
)

Train on 1299 samples, validate on 325 samples
Epoch 1/1000

Epoch 00001: val_auc improved from -inf to 0.53622, saving model to checkpoints/1625/1625_weights.h5
Epoch 2/1000

Epoch 00002: val_auc did not improve from 0.53622
Epoch 3/1000

Epoch 00003: val_auc did not improve from 0.53622
Epoch 4/1000

Epoch 00004: val_auc improved from 0.53622 to 0.54096, saving model to checkpoints/1625/1625_weights.h5
Epoch 5/1000

Epoch 00005: val_auc improved from 0.54096 to 0.56302, saving model to checkpoints/1625/1625_weights.h5
Epoch 6/1000

Epoch 00006: val_auc improved from 0.56302 to 0.58683, saving model to checkpoints/1625/1625_weights.h5
Epoch 7/1000

Epoch 00007: val_auc improved from 0.58683 to 0.60988, saving model to checkpoints/1625/1625_weights.h5
Epoch 8/1000

Epoch 00008: val_auc improved from 0.60988 to 0.63379, saving model to checkpoints/1625/1625_weights.h5
Epoch 9/1000

Epoch 00009: val_auc improved from 0.63379 to 0.65414, saving model to checkpoints/1625/1625_weights.h5
Ep

<keras.callbacks.History at 0x7f2b83bcbbe0>

#### Test

In [127]:
# Score the 1625-trained model on the other three datasets
# (position 2 is '1625' in dict_data, so that entry of xs/ys is skipped).
predict_tests(model_1625, 2, xs, ys)

746
AUC: 0.9533020031040135

impens
AUC: 0.8067248827398045

schilling
AUC: 0.7988049620794342



### Impens

#### Train

In [128]:
# Stratified 80/20 hold-out split of the Impens dataset with a fixed random_state.
x_train, x_val, y_train, y_val = train_test_split(
    octamers_impens_vec,
    y_train_impens,
    test_size=0.2,
    random_state=1,
    stratify=y_train_impens,
)
model_impens = baseline_model()
# Trailing expression: the History object returned by fit() is the cell output.
model_impens.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=init_callbacks('impens'),
    verbose=1,
)

Train on 756 samples, validate on 190 samples
Epoch 1/1000

Epoch 00001: val_auc improved from -inf to 0.51558, saving model to checkpoints/impens/impens_weights.h5
Epoch 2/1000

Epoch 00002: val_auc did not improve from 0.51558
Epoch 3/1000

Epoch 00003: val_auc did not improve from 0.51558
Epoch 4/1000

Epoch 00004: val_auc did not improve from 0.51558
Epoch 5/1000

Epoch 00005: val_auc did not improve from 0.51558
Epoch 6/1000

Epoch 00006: val_auc did not improve from 0.51558
Epoch 7/1000

Epoch 00007: val_auc improved from 0.51558 to 0.51635, saving model to checkpoints/impens/impens_weights.h5
Epoch 8/1000

Epoch 00008: val_auc improved from 0.51635 to 0.52370, saving model to checkpoints/impens/impens_weights.h5
Epoch 9/1000

Epoch 00009: val_auc improved from 0.52370 to 0.53165, saving model to checkpoints/impens/impens_weights.h5
Epoch 10/1000

Epoch 00010: val_auc improved from 0.53165 to 0.54153, saving model to checkpoints/impens/impens_weights.h5
Epoch 11/1000

Epoch 00011

<keras.callbacks.History at 0x7f2b82846e10>

#### Test

In [129]:
# Score the Impens-trained model on the other three datasets
# (position 3 is 'impens' in dict_data, so that entry of xs/ys is skipped).
predict_tests(model_impens, 3, xs, ys)

746
AUC: 0.7678734606849136

1625
AUC: 0.7459379679144384

schilling
AUC: 0.8673251260093335



### Schilling

#### Train

In [None]:
# Stratified 80/20 hold-out split of the Schilling dataset with a fixed random_state.
x_train, x_val, y_train, y_val = train_test_split(
    octamers_schilling_vec,
    y_train_schilling,
    test_size=0.2,
    random_state=1,
    stratify=y_train_schilling,
)
model_schilling = baseline_model()
# Trailing expression: the History object returned by fit() is the cell output.
model_schilling.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=init_callbacks('schilling'),
    verbose=1,
)

Train on 2616 samples, validate on 655 samples
Epoch 1/1000

Epoch 00001: val_auc improved from -inf to 0.50703, saving model to checkpoints/schilling/schilling_weights.h5
Epoch 2/1000

Epoch 00002: val_auc improved from 0.50703 to 0.50792, saving model to checkpoints/schilling/schilling_weights.h5
Epoch 3/1000

Epoch 00003: val_auc improved from 0.50792 to 0.51419, saving model to checkpoints/schilling/schilling_weights.h5
Epoch 4/1000

Epoch 00004: val_auc improved from 0.51419 to 0.52946, saving model to checkpoints/schilling/schilling_weights.h5
Epoch 5/1000

Epoch 00005: val_auc improved from 0.52946 to 0.55212, saving model to checkpoints/schilling/schilling_weights.h5
Epoch 6/1000

Epoch 00006: val_auc improved from 0.55212 to 0.57960, saving model to checkpoints/schilling/schilling_weights.h5
Epoch 7/1000

Epoch 00007: val_auc improved from 0.57960 to 0.60562, saving model to checkpoints/schilling/schilling_weights.h5
Epoch 8/1000

Epoch 00008: val_auc improved from 0.60562 to 

#### Test

In [None]:
# Score the Schilling-trained model on the other three datasets
# (position 4 is 'schilling' in dict_data, so that entry of xs/ys is skipped).
predict_tests(model_schilling, 4, xs, ys)

Como pode ser observado, a modelagem utilizando redes neurais recorrentes, não se demonstrou muito eficiente com 