In [2]:
import numpy as np
import progressbar

In [33]:
def _calc_empty_rows(X):
    empty_row = np.zeros((1, 4))
    empty_rows = [int(v) for v in np.all(empty_row == X, axis=1)]
    return empty_rows

def create_dataset_with_neighbourhood(X_paths, y_paths, neighbourhood_size):
    if not len(X_paths) == len(y_paths):
        raise ValueError('Number of X_paths and y_paths shoulde be the same!')
    
    new_X, new_y = list(), list()
    for X_path, y_path in zip(X_paths, y_paths):
        print('Parsing ', X_path, ' and ', y_path)
        
        X, y = np.load(X_path), np.load(y_path)
        # Removing last column which everything which was not 'A' nor 'C' nor 'G' nor 'T'.
        y = y[:, :4]
        
        empty_rows = _calc_empty_rows(X)
        
        print('Creating dataset with neighbrouhood ...')
        with progressbar.ProgressBar(max_value=X.shape[0]) as progress_bar:
            # TODO(ajuric): Check if this can be speed up.
            for i in range(X.shape[0]):
                progress_bar.update(i)
                if empty_rows[i] == 1:
                    continue # current row is empty row
                if i < neighbourhood_size or i >= X.shape[0]-neighbourhood_size:
                    continue # current position is not suitible to build example
                zeros_to_left = np.sum(empty_rows[i-neighbourhood_size:i])
                zeros_to_right = np.sum(empty_rows[i+1:i+neighbourhood_size+1])
                if zeros_to_left == 0 and zeros_to_right == 0:
                    new_X.append(X[i-neighbourhood_size:i+neighbourhood_size+1])
                    new_y.append(y[i])
    
    return new_X, new_y

In [34]:
X_paths = ['./e-coli-NCTC86-pysam-X-dataset.npy', './m-morgani-NCTC235-pysam-X-dataset.npy',
           './s-enterica-NCTC92-pysam-X-dataset.npy', './s-enterica-NXTC129-pysam-X-dataset.npy']
y_paths = ['./e-coli-NCTC86-pysam-y-dataset.npy', './m-morgani-NCTC235-pysam-y-dataset.npy',
           './s-enterica-NCTC92-pysam-y-dataset.npy', './s-enterica-NXTC129-pysam-y-dataset.npy']
neighbourhood_size = 3

X, y = create_dataset_with_neighbourhood(X_paths, y_paths, neighbourhood_size)

Parsing  ./e-coli-NCTC86-pysam-X-dataset.npy  and  ./e-coli-NCTC86-pysam-y-dataset.npy
Calculating empty rows ...


  0% (16451 of 4641652) |                | Elapsed Time: 0:00:00 ETA:   0:00:56

Creating dataset with neighbrouhood ...


100% (4641652 of 4641652) |##############| Elapsed Time: 0:00:54 Time:  0:00:54


Parsing  ./m-morgani-NCTC235-pysam-X-dataset.npy  and  ./m-morgani-NCTC235-pysam-y-dataset.npy
Calculating empty rows ...


  0% (15543 of 3799539) |                | Elapsed Time: 0:00:00 ETA:   0:00:48

Creating dataset with neighbrouhood ...


100% (3799539 of 3799539) |##############| Elapsed Time: 0:00:06 Time:  0:00:06


KeyboardInterrupt: 

In [23]:
np.save('./pysam-all-dataset-n3-X.npy', X)
np.save('./pysam-all-dataset-n3-y.npy', y)

In [4]:
import os
import sys
module_path = '/home/diplomski-rad/consensus-net/src/python/dataset'
if not module_path in sys.path:
    print('dodajeeem!')
    sys.path.append(module_path)

import dataset

In [5]:
X, y, X_train, X_validate, y_train, y_validate = dataset.read_dataset_and_reshape_for_conv(
    './pysam-all-dataset-n3-X.npy', './pysam-all-dataset-n3-y.npy', 0.1)

  0% (19521 of 16691589) |               | Elapsed Time: 0:00:00 ETA:   0:02:50

X shape before reshaping: (16691589, 7, 4)
y shape before reshaping: (16691589, 4)


100% (16691589 of 16691589) |############| Elapsed Time: 0:03:06 Time:  0:03:06


X shape after reshaping: (16691589, 7, 1, 4)
y shape after reshaping: (16691589, 4)
Splitting to train and validation set.
X_train shape: (15022430, 7, 1, 4)
X_validate shape: (1669159, 7, 1, 4)
y_train: (15022430, 4)
y_validate: (1669159, 4)


In [7]:
from keras.models import Model
from keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization, Input
from keras.layers import Conv1D, MaxPooling1D, Conv2D
from keras.utils import np_utils


input_layer = Input(shape=(7, 1, 4))
conv_1 = Conv2D(filters=4, kernel_size=3, padding='same', activation='relu')(input_layer)

flatten = Flatten()(conv_1)
predictions = Dense(4, activation='softmax')(flatten)

model = Model(input_layer, predictions)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())

batch_size = 10000
epochs = 200

model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_validate, y_validate))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 7, 1, 4)           0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 7, 1, 4)           148       
_________________________________________________________________
flatten_1 (Flatten)          (None, 28)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 116       
Total params: 264
Trainable params: 264
Non-trainable params: 0
_________________________________________________________________
None
Train on 15022430 samples, validate on 1669159 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200

KeyboardInterrupt: 