<a href="https://colab.research.google.com/github/mchowdh200/exome-copy/blob/master/test_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#!pip install --upgrade pandas

In [0]:

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPool1D, Flatten, Dense, Dropout, LeakyReLU, BatchNormalization, Activation
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ReduceLROnPlateau
from tensorflow.keras.utils import normalize
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

In [3]:
# Mount Google Drive inside the Colab VM. The pickled datasets loaded by the
# following cells are addressed through this mount point.
from google.colab import drive

gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Load/Process Data

In [0]:
# Deletion samples, stored as a pickled pandas object on the mounted Drive.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
deletions_pkl = '/content/gdrive/My Drive/Colab Notebooks/deletions.pkl'
deletions = pd.read_pickle(deletions_pkl)

In [0]:
# Duplication samples, stored as a pickled pandas object on the mounted Drive.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
duplications_pkl = '/content/gdrive/My Drive/Colab Notebooks/duplications.pkl'
duplications = pd.read_pickle(duplications_pkl)

In [0]:
# Samples without a structural variant, stored as a pickled pandas object on
# the mounted Drive.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
non_sv_pkl = '/content/gdrive/My Drive/Colab Notebooks/non_sv.pkl'
non_sv = pd.read_pickle(non_sv_pkl)

In [7]:
# Show how many samples each class contributes (non-SV, deletions,
# duplications, in that order) to make the class imbalance visible.
for frame in (non_sv, deletions, duplications):
    print(frame.data.values.shape)

(105339,)
(48939,)
(19485,)


In [0]:
# Build the sample array and the matching integer class ids from ONE ordered
# tuple, so the order of `data` and the order of `labels` cannot drift apart.
# Position in the tuple is the class id:
#   0 = non-SV, 1 = deletion, 2 = duplication
class_frames = (non_sv, deletions, duplications)

# Each frame's `data` column holds one entry per sample; concatenating the
# columns gives a single 1-D array with one entry per sample across all classes.
data = np.concatenate([frame.data.values for frame in class_frames])

# One label per sample, in the same order as `data`.  The shape is written as a
# proper 1-tuple `(n,)`; the previous `(len(x,))` form had the comma inside the
# len() call and only worked because np.full also accepts a bare int.
# float64 keeps the dtype the labels had before (zeros + ints promoted to float).
labels = np.concatenate([
    np.full((len(frame),), fill_value=class_id, dtype=np.float64)
    for class_id, frame in enumerate(class_frames)
])

In [0]:
# One-hot encode the integer class ids (0, 1, 2) for categorical cross-entropy.
# - The ndim guard makes this cell safe to re-run: without it a second run
#   would one-hot encode the already encoded matrix.
# - num_classes is pinned to 3 so the label width always matches the 3-unit
#   softmax output, even if one class happened to be absent from the data.
if labels.ndim == 1:
    labels = to_categorical(labels, num_classes=3)

In [0]:
# Bring every sample to a fixed width of 500 so the samples can be stacked into
# one array.  pad_sequences is applied per sample, i.e. each row of a sample is
# padded/truncated independently to 500 entries.
# NOTE(review): pad_sequences defaults to dtype='int32'; if the samples hold
# floats they are truncated to integers here -- confirm this is intended.
# Disabled variant that normalised each sample before padding:
#   pad_sequences(normalize(d, axis=0), maxlen=500)
data_padded = []
for sample in data:
    data_padded.append(pad_sequences(sample, maxlen=500))

In [0]:
# Turn the Python list of padded samples into a single ndarray (one leading
# axis over samples) so it can be split and fed to the model.
data_padded = np.asarray(data_padded)

In [0]:
# Sanity check: display the shape of the stacked, padded dataset.
np.shape(data_padded)

## Define Model

In [0]:
def conv_block(filters, **conv_kwargs):
    """Return the layers of one convolutional stage as a list.

    Stage layout: Conv1D -> BatchNormalization -> LeakyReLU -> Dropout -> MaxPool1D.

    Every layer that has a notion of "channel axis" is configured for
    channels-first tensors, i.e. (batch, channels, steps), to match the
    Conv1D layers.  Previously only Conv1D was channels-first while
    MaxPool1D / BatchNormalization used their channels-last defaults, so the
    pooling ran over the filter axis instead of the sequence axis and batch
    norm normalised per position instead of per filter.

    Args:
        filters: number of convolution filters for this stage.
        **conv_kwargs: extra keyword arguments forwarded to Conv1D
            (used to pass `input_shape` to the first stage).

    Returns:
        list of Keras layers, ready to be unpacked into a Sequential model.
    """
    return [
        Conv1D(filters=filters,
               kernel_size=6,
               strides=1,
               dilation_rate=1,
               data_format='channels_first',
               kernel_initializer='glorot_uniform',
               **conv_kwargs),
        # axis=1 is the filter axis for channels-first tensors.
        BatchNormalization(axis=1),
        # LeakyReLU is a layer; add it directly instead of wrapping it in
        # Activation(...), which expects an activation function.
        LeakyReLU(),
        Dropout(0.25),
        # Pool along the sequence axis (last axis in channels-first layout).
        MaxPool1D(pool_size=3, data_format='channels_first'),
    ]


# Three identical conv stages with growing filter counts, followed by a
# 3-way softmax (one unit per class in the one-hot labels).
# All stages now share the same layer order; the first stage used to apply
# Dropout before BatchNormalization, unlike the other two.
# Input shape is everything but the sample axis of `data_padded`
# (assumed to be (channels, 500) per sample -- TODO confirm channel count).
# NOTE(review): channels_first conv/pooling may require a GPU runtime in
# TensorFlow -- confirm the Colab runtime type.
model = Sequential([
    *conv_block(128, input_shape=data_padded.shape[1:]),
    *conv_block(256),
    *conv_block(512),

    Flatten(),
    Dense(3, activation='softmax'),
])

# clipnorm=1 rescales each gradient whose norm exceeds 1.
model.compile(optimizer=Adam(clipnorm=1, amsgrad=False),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Hold out 10% for validation, stratified so the class proportions are kept.
# random_state is fixed so the split is the same on every Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(
    data_padded, labels,
    stratify=labels,
    test_size=0.1,
    random_state=42,
)

In [0]:
# Training callbacks (both use their default monitored quantity):
#  - stop once there has been no improvement for 4 epochs
#  - multiply the learning rate by 0.2 after 3 epochs without improvement
early_stopping = EarlyStopping(patience=4)
lr_on_plateau = ReduceLROnPlateau(patience=3, factor=0.2)
callbacks = [early_stopping, lr_on_plateau]

# Train for at most 20 epochs, evaluating on the held-out validation split
# after every epoch.
model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=20,
    verbose=1,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
)