**Note:** The actual training was done in a Python script that corresponds to this notebook. The only difference is that the script also saves the model at the end.

In [None]:
from os import path

import numpy as np
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Reshape, GlobalAveragePooling1D, Layer, Input
from keras.layers import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D, BatchNormalization, Activation, Add
from keras.utils import np_utils
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import (
    roc_auc_score, f1_score, confusion_matrix,
    plot_precision_recall_curve, plot_roc_curve)

from sklearn.utils.class_weight import compute_class_weight

from sklearn.metrics import (
    roc_auc_score, f1_score, balanced_accuracy_score,
    average_precision_score, precision_recall_curve,
    confusion_matrix, auc, roc_curve,
    plot_precision_recall_curve, plot_roc_curve,
    recall_score, precision_score, auc)

from cf_matrix import make_confusion_matrix
from data_io import get_data, balance_out
from data_representation import (
    kmer_counts, one_hot_encoding, kmer_embeddings)
from utils import (
    get_class_distribution, binarize, plot_class_distribution)

%matplotlib inline
%load_ext autoreload
%autoreload 2

## Data

In [None]:
# Experiment configuration read by the data-loading and representation cells.
species = 'human'
form = 'split'
k = 1  # forwarded as `k=` to get_data and one_hot_encoding (presumably the k-mer size -- confirm)
drop = 0.0  # only forwarded to the training-split get_data call
kmer_representation = '1-hot'  # either '1-hot' or 'dense'

In [None]:
# Training split; this is the only split for which `drop` is passed explicitly.
X_train_raw, y_train = get_data(
    species,
    form=form,
    mode='train',
    k=k,
    drop=drop,
)

In [None]:
# Validation split (no `drop` argument, so get_data's default applies).
X_val_raw, y_val = get_data(
    species,
    form=form,
    mode='val',
    k=k,
)

In [None]:
# Held-out test split (no `drop` argument, so get_data's default applies).
X_test_raw, y_test = get_data(
    species,
    form=form,
    mode='test',
    k=k,
)

In [None]:
# Map every negative label to 0, in place, on all three splits so the labels
# become {0, 1}.
# NOTE(review): presumably the negative class is encoded as -1 upstream -- confirm in get_data.
for _labels in (y_train, y_val, y_test):
    _labels[_labels < 0] = 0

### Representation

In [None]:
# Encode the raw sequences with the representation selected by `kmer_representation`.
if kmer_representation == '1-hot':
    X_train = one_hot_encoding(X_train_raw, k=k, form='2d')
    X_val = one_hot_encoding(X_val_raw, k=k, form='2d')
    X_test = one_hot_encoding(X_test_raw, k=k, form='2d')
elif kmer_representation == 'dense':
    # NOTE(review): k is hard-coded to 6 here instead of using the configured `k`;
    # confirm this is intentional (e.g. tied to the available embeddings).
    X_train = kmer_embeddings(X_train_raw, k=6, to_split=True)
    X_val = kmer_embeddings(X_val_raw, k=6, to_split=True)
    X_test = kmer_embeddings(X_test_raw, k=6, to_split=True)
else:
    # Fail fast: without this branch a typo in the config leaves X_train/X_val/X_test
    # undefined and the error only shows up later as a NameError.
    raise ValueError(
        f"Unknown kmer_representation {kmer_representation!r}; "
        "expected '1-hot' or 'dense'.")

In [None]:
# One-hot encode the integer labels of each split; these are the targets
# passed to model.fit and used as validation targets.
cy_train, cy_val, cy_test = (
    to_categorical(labels) for labels in (y_train, y_val, y_test)
)

## Model

In [None]:
# Input geometry taken from the encoded training array: axis 1 is used as the
# sequence length and axis 2 as the channel count of the model input.
SEQUENCE_LENGTH, N_CHANNELS = X_train.shape[1], X_train.shape[2]

# Size of one flattened input sample.
INPUT_SHAPE = SEQUENCE_LENGTH * N_CHANNELS
N_CLASSES = 2

# Convolution hyperparameters shared by both architectures.
N_FILT, KERNEL_SIZE = 32, 11

### SpliceAI Model

We also experimented with a model that is more faithful to the SpliceAI architecture, but we did not end up using it.

In [None]:
class Residual(Layer):
    """Residual unit made of two (BatchNorm -> ReLU -> Conv1D) stages and a skip connection.

    The block input is added element-wise to the output of the second
    convolution, so the incoming tensor must already have ``n_filters``
    channels for the addition to be shape-compatible.

    Args:
        n_filters: Number of filters used by both convolutions.
        kernel_size: Kernel size used by both convolutions.
        dilation_rate: Dilation rate used by both convolutions.
        **kwargs: Forwarded unchanged to the base ``Layer`` constructor.
    """

    def __init__(self, n_filters, kernel_size, dilation_rate, **kwargs):
        super(Residual, self).__init__(**kwargs)

        self.n_filters = n_filters
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate

        # First BatchNorm -> ReLU -> Conv1D stage.
        self.batch_norm1 = BatchNormalization()
        self.act1 = Activation('relu')
        self.conv_1 = Conv1D(
            self.n_filters,
            self.kernel_size,
            dilation_rate=self.dilation_rate,
            padding='causal')

        # Second BatchNorm -> ReLU -> Conv1D stage.
        self.batch_norm2 = BatchNormalization()
        self.act2 = Activation('relu')
        self.conv_2 = Conv1D(
            self.n_filters,
            self.kernel_size,
            dilation_rate=self.dilation_rate,
            padding='causal')

        # Merges the skip connection with the convolutional branch.
        self.add1 = Add()

    def call(self, x):
        """Apply both stages to ``x`` and return the result summed with ``x`` itself."""
        # Keep a reference to the block input for the skip connection. Tensors
        # expose no ``.copy()`` method, and none is needed: the layers below
        # return new tensors instead of modifying ``x``.
        shortcut = x

        x = self.batch_norm1(x)
        x = self.act1(x)
        x = self.conv_1(x)

        x = self.batch_norm2(x)
        x = self.act2(x)
        x = self.conv_2(x)

        return self.add1([x, shortcut])

    def compute_output_shape(self, input_shape):
        """Return ``input_shape`` unchanged (the skip addition requires matching shapes)."""
        return input_shape

    def get_config(self):
        """Return the constructor arguments so a saved model can re-create this layer."""
        config = super(Residual, self).get_config()
        config.update({
            'n_filters': self.n_filters,
            'kernel_size': self.kernel_size,
            'dilation_rate': self.dilation_rate,
        })
        return config
    

    
# Functional-API assembly of the SpliceAI-style network.
X_input = Input(shape=(SEQUENCE_LENGTH, N_CHANNELS))

# Kernel-size-1 projection to N_FILT channels, so the element-wise additions
# inside the residual units are shape-compatible.
X = Conv1D(
    N_FILT, 1, dilation_rate=1, padding='causal')(X_input)

# First skip branch, taken before any residual unit.
X_r1 = Conv1D(
    N_FILT, 1, dilation_rate=1, padding='causal')(X)

# Four residual units with dilation rate 1.
for _unit_idx in range(4):
    X = Residual(N_FILT, KERNEL_SIZE, dilation_rate=1)(X)

# Second skip branch, taken after the first group of residual units.
X_r2 = Conv1D(
    N_FILT, 1, dilation_rate=1, padding='causal')(X)

# Four residual units with dilation rate 4.
for _unit_idx in range(4):
    X = Residual(N_FILT, KERNEL_SIZE, dilation_rate=4)(X)

X = Conv1D(
    N_FILT, 1, dilation_rate=1, padding='causal')(X)

# Sum both skip branches, then add them to the main path.
X_r = Add()([X_r1, X_r2])
X = Add()([X_r, X])

X = Conv1D(3, 1, dilation_rate=1, padding='causal')(X)

# Classification head: flatten all positions and predict N_CLASSES probabilities.
X = Flatten()(X)
X = Dense(N_CLASSES, activation='softmax')(X)

model = Model(inputs=X_input, outputs=X, name='SpliceAI-400')

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit a stray "None" line).
model.summary()

### Our Model

In [None]:
# Plain stack of causal 1-D convolutions: one kernel-size-1 layer, three
# undilated layers, four layers with dilation rate 4, then a softmax head.
model = Sequential()

model.add(Conv1D(
    N_FILT, 1, dilation_rate=1, activation='relu', padding='causal',
    input_shape=(SEQUENCE_LENGTH, N_CHANNELS)))
for _layer_idx in range(3):
    model.add(Conv1D(
        N_FILT, KERNEL_SIZE, dilation_rate=1, activation='relu', padding='causal'))

model.add(Dropout(0.5))
model.add(BatchNormalization())
for _layer_idx in range(4):
    model.add(Conv1D(
        N_FILT, KERNEL_SIZE, dilation_rate=4, activation='relu', padding='causal'))

model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(N_CLASSES, activation='softmax'))

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit a stray "None" line).
model.summary()

In [None]:
# Training hyperparameters.
BATCH_SIZE = 16
EPOCHS = 20

model.compile(optimizer='adam', loss='binary_crossentropy')

# Class 1 is weighted four times as heavily as class 0 in the loss.
history = model.fit(
    X_train,
    cy_train,
    validation_data=(X_val, cy_val),
    class_weight={0: 1, 1: 4},
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
)

### Evaluation

In [None]:
# Per-class probabilities for the test split. The array itself is passed to
# predict(); `X_test.astype` without a call is a bound method, not data.
y_pred = model.predict(X_test)
# Probability assigned to class 1 (column 1 of the softmax output).
y_probs = y_pred[:, 1]
# Hard class decision: index of the most probable class per sample.
y_pred_1d = np.argmax(y_pred, axis=1)

In [None]:
# Threshold-based metrics use the hard predictions (y_pred_1d); the ranking
# metrics (AUROC, AUPRC) use the class-1 probabilities (y_probs).
recall = recall_score(y_test, y_pred_1d)
print(f'Recall = {recall}.')

precision = precision_score(y_test, y_pred_1d)
print(f'Precision = {precision}.')

macro_f1 = f1_score(y_test, y_pred_1d, average="macro")
print(f'F1 score = {macro_f1}.')

balanced_accuracy = balanced_accuracy_score(y_test, y_pred_1d)
print(f'Balanced accuracy score = {balanced_accuracy}.')

auroc = roc_auc_score(y_test, y_probs, average="macro")
print(f'AUROC = {auroc}.')

auprc = average_precision_score(y_test, y_probs, average="macro")
print(f'AUPRC = {auprc}.')