In [1]:
import numpy as np
import pickle
import keras.backend as K

from keras.layers import  GlobalAveragePooling1D, Input, Activation, MaxPooling1D, BatchNormalization, Dense, Dropout, Conv1D,GlobalMaxPooling1D
from keras.layers import GRU,AveragePooling1D,CuDNNGRU
from keras.layers.merge import Concatenate
from keras.models import Model 
from keras.callbacks import EarlyStopping,ModelCheckpoint
from gensim.models import Word2Vec 

import keras.backend.tensorflow_backend as KTF
import tensorflow as tf
import os


# Expose only GPU 0 to TensorFlow.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

config = tf.ConfigProto()
config.gpu_options.allow_growth=True   # do not grab all GPU memory up front; allocate on demand
sess = tf.Session(config=config)

# Make Keras use the session configured above.
KTF.set_session(sess)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
class TermVectors:
    """Term embeddings read from a whitespace-separated text file.

    Expected file layout: the first line is a header (the number of terms,
    not a term ID) and is skipped; every following non-blank line is
    ``<term> <v1> <v2> ...``.

    Attributes:
        termVectorDict: maps term (str) -> 1-D ``numpy`` array of floats.
        terms: list of terms in file order (``None`` until a file is parsed).
        vectors: the raw text lines the terms were read from, aligned with
            ``terms`` (``None`` until a file is parsed).
    """

    def __init__(self):
        self.termVectorDict=dict()
        self.terms=None
        self.vectors=None

    def parse_term_embedding_file(self,file_path):
        """Read ``file_path`` and populate ``self.terms`` / ``self.vectors``.

        The header line is dropped. Blank lines are skipped so that a
        trailing newline at the end of the file does not raise IndexError.
        """
        with open(file_path) as f:
            lines = f.readlines()
        # The first line holds the number of terms, not a term ID.
        records = [line for line in lines[1:] if line.strip()]
        self.vectors = records
        self.terms = [line.split()[0] for line in records]

    def str_to_vector(self,Str):
        """Convert a whitespace-separated string of numbers to a 1-D array."""
        return np.array([float(num) for num in Str.strip('\n').split()])

    def construct_term_vector_dict(self):
        """Fill ``self.termVectorDict`` from the parsed terms and lines.

        Terms and lines are walked in lockstep instead of looking each term
        up with ``list.index`` (which made the loop quadratic). When a term
        occurs more than once, the first occurrence wins, which is what the
        ``list.index`` lookup produced as well.
        """
        first_seen = {}
        for term, line in zip(self.terms, self.vectors):
            if term in first_seen:
                continue
            # Everything after the first token is the vector payload; splitting
            # on whitespace stays correct even if the line is indented.
            parts = line.split(None, 1)
            payload = parts[1] if len(parts) > 1 else ''
            first_seen[term] = self.str_to_vector(payload)
        self.termVectorDict.update(first_seen)

In [3]:
# Load the Node2Vec embeddings of the three GO ontologies (BP / CC / MF).
BP_TERM_EMB_FILE_PATH = '../Node2Vec/EDGELIST/BP.emb'
CC_TERM_EMB_FILE_PATH = '../Node2Vec/EDGELIST/CC.emb'
MF_TERM_EMB_FILE_PATH = '../Node2Vec/EDGELIST/MF.emb'
Node2Vec_dim = 300


def _load_term_vectors(emb_path):
    """Parse one embedding file into a TermVectors and print how many terms it holds."""
    loaded = TermVectors()
    loaded.parse_term_embedding_file(emb_path)
    loaded.construct_term_vector_dict()
    print(len(loaded.termVectorDict))
    return loaded


BPTermVectors = _load_term_vectors(BP_TERM_EMB_FILE_PATH)

CCTermVectors = _load_term_vectors(CC_TERM_EMB_FILE_PATH)

MFTermVectors = _load_term_vectors(MF_TERM_EMB_FILE_PATH)

29699
4202
11148


In [4]:
# Read go.obo and map every GO id (primary and alt_id) to its namespace.
def parse_obo_namespaces(obo_text):
    """Return ``{go_id: namespace}`` for every ``[Term]`` stanza in ``obo_text``.

    Only the text between the first ``[Term]`` and the first ``[Typedef]``
    marker is considered. Both ``id:`` and ``alt_id:`` entries of a stanza
    are mapped to that stanza's ``namespace:`` value (empty string when the
    stanza has none).

    ``str.find`` returns -1 when a marker is absent, so both markers are
    checked explicitly instead of being fed straight into a slice.
    """
    start = obo_text.find("[Term]")
    if start < 0:
        return {}
    body = obo_text[start:]
    stop = body.find("[Typedef]")
    if stop >= 0:
        body = body[:stop]

    id_to_namespace = {}
    for stanza in body.split("[Term]\n"):
        if not stanza.strip():
            continue
        namespace = ''
        term_ids = []
        for line in stanza.splitlines():
            if line.startswith("id: "):
                term_ids.append(line[len("id: "):])
            elif line.startswith("namespace: "):
                namespace = line[len("namespace: "):]
            elif line.startswith("alt_id: "):
                term_ids.append(line[len("alt_id: "):])
        for term_id in term_ids:
            id_to_namespace[term_id] = namespace
    return id_to_namespace


obo_file = '../cross-species/go.obo'
# `with` closes the file even if reading fails.
with open(obo_file, 'r') as fp:
    obo_txt = fp.read()
id_type_dicts = parse_obo_namespaces(obo_txt)

In [5]:
import numpy as np
# The 20 standard amino-acid one-letter codes; the position in this array is
# the one-hot column index.
alphabet = np.array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
                     'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'])

# Residue letter -> one-hot column, built once instead of scanning `alphabet`
# for every residue.
_AA_INDEX = {letter: idx for idx, letter in enumerate(alphabet.tolist())}

# Non-specific amino-acid codes that protein_one_hot removes before encoding.
_AMBIGUOUS_CODES = frozenset('BJOUXZ')


def label_sequence(line, MAX_SEQ_LEN, smi_ch_ind):
    """Encode ``line`` as a fixed-length vector of integer labels.

    Args:
        line: sequence of characters to encode.
        MAX_SEQ_LEN: output length; longer input is truncated, shorter input
            is right-padded with zeros.
        smi_ch_ind: mapping character -> label.

    Returns:
        1-D float array of length ``MAX_SEQ_LEN``.

    Raises:
        KeyError: if a character of ``line`` is missing from ``smi_ch_ind``.
    """
    X = np.zeros(MAX_SEQ_LEN)
    for i, ch in enumerate(line[:MAX_SEQ_LEN]):
        X[i] = smi_ch_ind[ch]
    return X


def letter_one_hot(aa):
    """Return the length-20 one-hot vector for the amino-acid letter ``aa``.

    A letter that is not in ``alphabet`` yields an all-zero vector. Returning
    ``None`` for it (the implicit result of falling off the end of a search
    loop) would end up as NaN when assigned into a float array.
    """
    one_hot = np.zeros(len(alphabet))
    column = _AA_INDEX.get(aa)
    if column is not None:
        one_hot[column] = 1
    return one_hot


# Convert an entire protein to one-hot representation.
def protein_one_hot(protein_sequence, MAX_SEQ_LEN):
    """One-hot encode a protein sequence into a ``(MAX_SEQ_LEN, 20)`` array.

    The non-specific codes B, J, O, U, X and Z are removed first (so later
    residues shift left), then the sequence is truncated to ``MAX_SEQ_LEN``.
    Rows past the end of the sequence stay zero, and so does the row of any
    remaining character that is not in ``alphabet``.
    """
    residues = [aa for aa in protein_sequence if aa not in _AMBIGUOUS_CODES]
    one_hot_seq = np.zeros((MAX_SEQ_LEN, len(alphabet)))
    for idx, aa in enumerate(residues[:MAX_SEQ_LEN]):
        column = _AA_INDEX.get(aa)
        if column is not None:
            one_hot_seq[idx, column] = 1
    return one_hot_seq


In [6]:
import keras
# Width of one GO feature row (last axis of the GO model inputs).
feature_len = 768
# Maximum number of GO feature rows kept per protein (time axis of the GO inputs).
max_go_len = 256
# Length to which protein sequences are truncated / zero-padded for one-hot encoding.
max_seq_len = 1000
# Maximum number of Node2Vec rows kept per protein (time axis of the node inputs).
max_node_len = 128
from six.moves import cPickle as pickle #for performance

 
def save_dict(di_, filename_):
    """Pickle ``di_`` into ``filename_``, overwriting any existing file."""
    with open(filename_, 'wb') as sink:
        pickle.dump(di_, sink)

def load_dict(filename_):
    """Return the object unpickled from ``filename_``.

    Unpickling can execute arbitrary code, so only call this on files from
    a trusted source.
    """
    with open(filename_, 'rb') as source:
        return pickle.load(source)



def build_prot2nodevec(protein_to_gos, namespace_vectors, go_namespaces, dim, max_rows):
    """Stack the Node2Vec vectors of each protein's GO terms.

    Args:
        protein_to_gos: mapping protein id -> comma-separated GO id string.
        namespace_vectors: mapping GO namespace -> object whose
            ``termVectorDict`` maps the numeric GO id (leading zeros
            stripped) to a vector of length ``dim``.
        go_namespaces: mapping GO id -> namespace.
        dim: length of one embedding vector.
        max_rows: maximum number of rows kept per protein.

    Returns:
        dict protein id -> float array of shape ``(k, dim)``, ``k <= max_rows``.
        A GO term with an unknown id, an unknown namespace or no embedding
        contributes an all-zero row.

    GO ids are de-duplicated and visited in sorted order. Iterating a bare
    ``set`` of strings gives an order that can change between interpreter
    runs, which would change both the row order and which terms survive the
    ``max_rows`` cut.
    """
    result = {}
    for protein, go_field in protein_to_gos.items():
        rows = []
        for go in sorted(set(go_field.split(','))):
            if not go.startswith('GO'):
                continue
            if len(rows) + 1 > max_rows:
                break
            store = namespace_vectors.get(go_namespaces.get(go))
            vector = None
            if store is not None:
                # Embedding files key terms by the numeric id without leading zeros.
                vector = store.termVectorDict.get(go[3:].lstrip('0'))
            if vector is None:
                rows.append(np.zeros((1, dim)))
            else:
                rows.append(vector.reshape(1, dim))
        # One concatenate per protein; the empty float64 seed fixes the dtype
        # and gives a (0, dim) result when the protein has no GO terms.
        result[protein] = np.concatenate([np.zeros((0, dim))] + rows)
    return result


protein2go = load_dict('ATprot2go.pkl')
prot2nodevec = build_prot2nodevec(
    protein2go,
    {
        'biological_process': BPTermVectors,
        'cellular_component': CCTermVectors,
        'molecular_function': MFTermVectors,
    },
    id_type_dicts,
    Node2Vec_dim,
    max_node_len,
)

In [7]:

def build_prot2emb(protein_to_gos, feature_dir, width, max_rows):
    """Stack the pre-computed GO feature rows of each protein.

    Args:
        protein_to_gos: mapping protein id -> comma-separated GO id string.
        feature_dir: directory prefix (with trailing slash) holding one
            ``<GO id>_0.npy`` file per GO term.
        width: number of columns of each feature row.
        max_rows: maximum number of rows kept per protein.

    Returns:
        dict protein id -> float array of shape ``(k, width)``, ``k <= max_rows``.

    GO ids are de-duplicated and visited in sorted order so the result is
    reproducible; iterating a bare ``set`` of strings can give a different
    order on each interpreter run, changing which terms fit under
    ``max_rows``. Accumulation stops at the first term that would exceed
    ``max_rows``.
    """
    result = {}
    for protein, go_field in protein_to_gos.items():
        chunks = []
        used = 0
        for go in sorted(set(go_field.split(','))):
            if not go.startswith('GO'):
                continue
            # The first and last rows of every stored array are dropped
            # (presumably special-token rows -- TODO confirm).
            feature = np.load(feature_dir + go + '_0.npy')[1:-1]
            if used + feature.shape[0] > max_rows:
                break
            chunks.append(feature)
            used += feature.shape[0]
        # One concatenate per protein; the empty float64 seed keeps the result
        # float64 and yields a (0, width) array when nothing was added.
        result[protein] = np.concatenate([np.zeros((0, width))] + chunks)
    return result


prot2emb = build_prot2emb(protein2go, '../ncbi_allfeatures4go/', feature_len, max_go_len)


class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` yielding batches of protein-pair features.

    Each line of ``ppi_pair_file`` is ``<protein1>\\t<protein2>\\t<label>``,
    where the label ``+`` marks a positive pair and anything else a negative
    one.

    A batch is ``([X_go1, X_go2, X_node1, X_node2, X_seq1, X_seq2], y)`` with
      * ``X_go*``   : ``(n, max_go_len, feature_len)`` GO feature rows,
      * ``X_node*`` : ``(n, max_node_len, Node2Vec_dim)`` Node2Vec rows,
      * ``X_seq*``  : ``(n, max_seq_len, 20)`` one-hot sequences,
      * ``y``       : ``(n,)`` labels, 1 for ``+`` and 0 otherwise.

    GO and Node2Vec features come from the module-level ``prot2emb`` and
    ``prot2nodevec`` dictionaries.
    """

    def __init__(self,  ppi_pair_file, batch_size=128):
        """Load the pair list and per-protein data.

        Args:
            ppi_pair_file: path of the tab-separated pair file.
            batch_size: number of pairs per batch.
        """
        self.batch_size = batch_size
        self.ppi_pair_file = ppi_pair_file

        self.max_seqlen = max_seq_len
        self.max_golen = max_go_len
        self.max_node_len = max_node_len
        self.protein2go = load_dict('ATprot2go.pkl')
        self.protein2seq = load_dict('ATprot2seq.pkl')
        self.read_ppi()
        self.prot2emb = prot2emb
        self.protein2onehot = {}
        self.onehot_seqs()
        self.prot2nodevec = prot2nodevec
        self.on_epoch_end()

    def read_ppi(self):
        """Read all lines of the pair file into ``self.ppi_pairs``."""
        with open(self.ppi_pair_file, 'r') as f:
            self.ppi_pairs = f.readlines()

    def onehot_seqs(self):
        """One-hot encode every sequence in ``self.protein2seq``."""
        for key, value in self.protein2seq.items():
            self.protein2onehot[key] = protein_one_hot(value, self.max_seqlen)

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        return int(np.floor(len(self.ppi_pairs) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(X, y)``."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        X, y = self.__data_generation(indexes)
        return X, y

    def on_epoch_end(self):
        """Reset the sample order to file order (no shuffling)."""
        self.indexes = np.arange(len(self.ppi_pairs))

    def __data_generation(self, list_IDs_temp):
        """Build one batch; same construction as ``all_data``."""
        return self.all_data(list_IDs_temp)

    def all_data(self, list_IDs_temp):
        """Build feature tensors and labels for the given pair indices.

        Args:
            list_IDs_temp: iterable of indices into ``self.ppi_pairs``.

        Returns:
            ``([X_go1, X_go2, X_node1, X_node2, X_seq1, X_seq2], y)``, with
            one row per entry of ``list_IDs_temp``.
        """
        n_samples = len(list_IDs_temp)

        # Zero-initialised on purpose: GO and node features fill only the
        # first rows of each sample, so the rest must be well-defined padding.
        # np.empty would leave arbitrary memory contents there.
        X_go1 = np.zeros((n_samples, self.max_golen, feature_len))
        X_go2 = np.zeros((n_samples, self.max_golen, feature_len))
        X_node1 = np.zeros((n_samples, self.max_node_len, Node2Vec_dim))
        X_node2 = np.zeros((n_samples, self.max_node_len, Node2Vec_dim))
        X_seq1 = np.zeros((n_samples, self.max_seqlen, 20))
        X_seq2 = np.zeros((n_samples, self.max_seqlen, 20))
        y = np.zeros((n_samples))

        for i, ID in enumerate(list_IDs_temp):
            p1, p2, label = self.ppi_pairs[ID].rstrip().split('\t')
            y[i] = 1 if label == '+' else 0

            for protein, X_go, X_node, X_seq in ((p1, X_go1, X_node1, X_seq1),
                                                 (p2, X_go2, X_node2, X_seq2)):
                go_rows = self.prot2emb[protein]
                X_go[i, :go_rows.shape[0]] = go_rows

                node_rows = self.prot2nodevec[protein]
                X_node[i, :node_rows.shape[0]] = node_rows

                X_seq[i] = self.protein2onehot[protein]

        return [X_go1, X_go2, X_node1, X_node2, X_seq1, X_seq2], y

In [8]:
from keras import backend as K, initializers, regularizers, constraints
from keras.engine.topology import Layer


def dot_product(x, kernel):
    """Backend-agnostic product of ``x`` with the weight vector ``kernel``.

    Args:
        x: input tensor.
        kernel: weights.

    Returns:
        ``K.dot(x, kernel)`` on non-TensorFlow backends. On TensorFlow the
        kernel gets a trailing axis before the dot, and that axis is
        squeezed out of the result again.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)
    # todo: check that this is correct
    column_kernel = K.expand_dims(kernel)
    projected = K.dot(x, column_kernel)
    return K.squeeze(projected, axis=-1)


class Attention(Layer):
    """Attention-weighted pooling over the time axis of temporal data.

    Supports masking.
    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`; when
        `return_attention=True`, additionally the attention weights with
        shape `(samples, steps)`.

    Can be placed on top of any layer that outputs a 3D tensor, e.g. an RNN
    layer (GRU/LSTM/SimpleRNN) with return_sequences=True. The weight shapes
    are inferred from the input shape; the bias has one entry per step, so
    the number of steps must be fixed.

    Example:
        # 1
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention())
        # next add a Dense layer (for classification/regression) or whatever...
        # 2 - Get the attention scores
        hidden = LSTM(64, return_sequences=True)(words)
        sentence, word_scores = Attention(return_attention=True)(hidden)
    """

    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False,
                 **kwargs):
        """
        Args:
            W_regularizer: regularizer for the attention weight vector.
            b_regularizer: regularizer for the per-step bias.
            W_constraint: constraint for the attention weight vector.
            b_constraint: constraint for the per-step bias.
            bias: whether to add a per-step bias before the tanh.
            return_attention: if True, `call` returns `[result, weights]`.
            **kwargs: forwarded to `Layer.__init__`.
        """
        self.supports_masking = True
        self.return_attention = return_attention
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weight vector (one per feature) and bias (one per step)."""
        assert len(input_shape) == 3

        # `shape` is passed by keyword so the call does not depend on the
        # legacy positional-argument order of `add_weight`.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over the step axis."""
        eij = dot_product(x, self.W)

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = x * K.expand_dims(a)

        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = {
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
            'return_attention': self.return_attention,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [9]:
from keras.layers import   Embedding
from keras.layers import  GRU, Bidirectional, CuDNNGRU, Lambda, dot, Flatten, CuDNNLSTM, Add
from keras.layers.merge import concatenate

def inception_block(input_tensor, output_size):
    """Apply four parallel Conv1D branches to ``input_tensor`` and concatenate them.

    Every convolution uses ``int(output_size/4)`` filters and ReLU activation.
    Branches: 3 -> 5, 1 -> 3, a single 3 and a single 1 (kernel sizes).
    Layers are created in the same order as listed in the code below.
    """
    n_filters = int(output_size/4)

    def conv(kernel_size, padding='same'):
        # New ReLU Conv1D layer with the shared filter count.
        return Conv1D(n_filters, kernel_size, activation="relu", padding=padding)

    stem_3 = conv(3)(input_tensor)
    branch_3_5 = conv(5)(stem_3)

    stem_1 = conv(1, padding='valid')(input_tensor)
    branch_1_3 = conv(3)(stem_1)

    branch_3 = conv(3)(input_tensor)
    branch_1 = conv(1)(input_tensor)

    return Concatenate()([branch_3_5, branch_1_3, branch_3, branch_1])


 
def build_cnn_gru_model(input_x, con_filters, gru_units):
    """Encode a 3D input into a 256-unit vector with a CNN + bidirectional GRU.

    The inception block output (after dropout) and the bidirectional
    CuDNNGRU output (after dropout) are each summarised by global average
    pooling, global max pooling and attention; the six summaries are
    concatenated and projected by ``Dense(256)``.
    """
    summarizers = (GlobalAveragePooling1D, GlobalMaxPooling1D, Attention)

    conv_out = inception_block(input_x, con_filters)
    conv_out = Dropout(0.3)(conv_out)

    recurrent = Bidirectional(CuDNNGRU(gru_units, return_sequences=True))
    gru_out = recurrent(conv_out)
    gru_out = Dropout(0.3)(gru_out)

    # Convolutional summaries first, then recurrent ones.
    conv_summaries = [make_layer()(conv_out) for make_layer in summarizers]
    gru_summaries = [make_layer()(gru_out) for make_layer in summarizers]

    merged = Concatenate()(conv_summaries + gru_summaries)
    return Dense(256)(merged)


def build_cnn_model(input_x, con_filters, gru_units):
    """Encode a 3D input into a 256-unit vector with the CNN branch only.

    The inception block output (after dropout) is summarised by global
    average pooling, global max pooling and attention; the three summaries
    are concatenated and projected by ``Dense(256)``. ``gru_units`` is
    accepted but not used.
    """
    conv_out = inception_block(input_x, con_filters)
    conv_out = Dropout(0.3)(conv_out)

    summaries = [
        make_layer()(conv_out)
        for make_layer in (GlobalAveragePooling1D, GlobalMaxPooling1D, Attention)
    ]

    merged = Concatenate()(summaries)
    return Dense(256)(merged)


def build_model():
    """Build and compile the six-input pair classifier.

    Inputs, in order: left/right GO features, left/right Node2Vec features,
    left/right one-hot sequences. Each input goes through its own
    ``build_cnn_gru_model`` encoder; the encodings are concatenated and
    passed through Dense(1024) -> Dense(1024) -> Dense(512) -> Dense(1) with
    a sigmoid output. Compiled with binary cross-entropy, Adam and accuracy.
    """
    con_filters, gru_units = 128, 64

    go_left = Input(shape=(max_go_len,feature_len))
    go_right = Input(shape=(max_go_len,feature_len))

    node_left = Input(shape=(max_node_len,Node2Vec_dim))
    node_right = Input(shape=(max_node_len,Node2Vec_dim))

    seq_left = Input(shape=(max_seq_len,20))
    seq_right = Input(shape=(max_seq_len,20))

    def encode(tensor):
        # Independent (non-shared) encoder for one input.
        return build_cnn_gru_model(tensor, con_filters, gru_units)

    # Encoders are created in this order: GO, sequence, node.
    go_left_enc = encode(go_left)
    go_right_enc = encode(go_right)

    seq_left_enc = encode(seq_left)
    seq_right_enc = encode(seq_right)

    node_left_enc = encode(node_left)
    node_right_enc = encode(node_right)

    hidden = Concatenate()([go_left_enc, go_right_enc,
                            node_left_enc, node_right_enc,
                            seq_left_enc, seq_right_enc])
    for units in (1024, 1024):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(0.1)(hidden)
    hidden = Dense(512, activation='relu')(hidden)

    logit = Dense(1)(hidden)
    output = Activation('sigmoid')(logit)

    model = Model([go_left, go_right, node_left, node_right, seq_left, seq_right], output)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


 

# Instantiate the model once and print its layer summary.
model = build_model()
model.summary()
# siamese_a = create_share_model()
# siamese_a.summary()





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 256, 768)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 256, 768)     0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 128, 300)     0                                            
___________________________________________________________________________

In [10]:
from sklearn.model_selection import StratifiedKFold
from keras.models import load_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
def evaluate_binary_predictions(y_true, y_prob, threshold=0.5):
    """Compute the binary-classification metrics reported per fold.

    Args:
        y_true: ground-truth labels (0/1), any shape; flattened to 1-D.
        y_prob: predicted probabilities, any shape (e.g. the ``(n, 1)``
            output of ``model.predict``); flattened to 1-D.
        threshold: probabilities strictly above this value count as positive.

    Returns:
        dict with keys ``AUC``, ``ACC``, ``MCC``, ``TPR``, ``FPR``, ``Pre``
        and ``F1``.
    """
    # The sklearn metrics expect 1-D inputs; flatten here instead of relying
    # on their implicit conversion of a column vector.
    y_true = np.asarray(y_true).ravel().astype(int)
    y_prob = np.asarray(y_prob).ravel()
    y_pred = (y_prob > threshold).astype(int)

    # Fixing the labels guarantees a 2x2 matrix, so the 4-way unpack is safe.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    return {
        'AUC': metrics.roc_auc_score(y_true, y_prob),
        'ACC': accuracy_score(y_true, y_pred),
        'MCC': matthews_corrcoef(y_true, y_pred),
        'TPR': float(tp)/float(tp+fn),
        'FPR': float(fp)/float((tn+fp)),
        'Pre': precision_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred),
    }


dataset_name = 'SC'
for rep in range(2,3):
    n_splits = 10
    # One slot per fold for every reported metric.
    TPRs = np.zeros(n_splits)
    FPRs = np.zeros(n_splits)
    Precs = np.zeros(n_splits)
    ACCs = np.zeros(n_splits)
    F1s = np.zeros(n_splits)
    MCCs = np.zeros(n_splits)
    AUCs = np.zeros(n_splits)

    for split in range(n_splits):
        # The folds are pre-split on disk: CV/{train,valid,test}<rep>-<split>.
        train_pairs_file = 'CV/train'+str(rep)+'-'+str(split)
        test_pairs_file = 'CV/test'+str(rep)+'-'+str(split)
        valid_pairs_file = 'CV/valid'+str(rep)+'-'+str(split)

        batch_size = 64
        train_generator = DataGenerator(train_pairs_file, batch_size=batch_size)
        test_generator = DataGenerator(test_pairs_file, batch_size=batch_size)
        valid_generator = DataGenerator(valid_pairs_file, batch_size=batch_size)

        model = build_model()
        save_model_name = 'CV/node_go_seq'+str(rep)+'-'+str(split) + '.hdf5'

        # Stop after 20 epochs without a val_acc improvement; keep only the
        # weights of the best epoch on disk.
        earlyStopping = EarlyStopping(monitor='val_acc', patience=20, verbose=0, mode='max')
        save_checkpoint = ModelCheckpoint(save_model_name, save_best_only=True, verbose=1,
                                          monitor='val_acc', mode='max', save_weights_only=True)

        hist = model.fit_generator(generator=train_generator,
                                   epochs=100, verbose=1,
                                   validation_data=valid_generator,
                                   callbacks=[earlyStopping, save_checkpoint])

        # Evaluate with the best checkpointed weights, not the last epoch's.
        model.load_weights(save_model_name)

        # The generator already holds the test pairs; score every one of them
        # (all_data is not limited to full batches).
        list_IDs_temp = np.arange(len(test_generator.ppi_pairs))
        test_x, y_test = test_generator.all_data(list_IDs_temp)

        y_pred_prob = model.predict(test_x)
        scores = evaluate_binary_predictions(y_test, y_pred_prob)

        print('--------------------------\n')
        print ('AUC: %f' % scores['AUC'])
        print ('ACC: %f' % scores['ACC'])
        print ('MCC : %f' % scores['MCC'])
        print('TPR:%f'%scores['TPR'])
        print('FPR:%f'%scores['FPR'])
        print('Pre:%f'%scores['Pre'])
        print('F1:%f'%scores['F1'])
        print('--------------------------\n')

        TPRs[split] = scores['TPR']
        FPRs[split] = scores['FPR']
        Precs[split] = scores['Pre']
        ACCs[split] = scores['ACC']
        F1s[split] = scores['F1']
        MCCs[split] = scores['MCC']
        AUCs[split] = scores['AUC']

        # Release the test tensors before the next fold allocates its own.
        del test_x
        del y_test

    print ('mean AUC: %f' % np.mean(AUCs))
    print ('mean ACC: %f' % np.mean(ACCs))
    print ('mean MCC : %f' % np.mean(MCCs))
    print('mean TPR:%f'% np.mean(TPRs))
    print('mean FPR:%f'% np.mean(FPRs))
    print('mean Pre:%f'% np.mean(Precs))
    print('mean F1:%f'% np.mean(F1s))
    np.savez('node2vec_go_seq'+str(rep), AUCs=AUCs, ACCs=ACCs, MCCs=MCCs, TPRs = TPRs, FPRs=FPRs, Precs=Precs, F1s=F1s)







Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.51562, saving model to CV/node_go_seq2-0.hdf5
Epoch 2/100

Epoch 00002: val_acc improved from 0.51562 to 0.64062, saving model to CV/node_go_seq2-0.hdf5
Epoch 3/100

Epoch 00003: val_acc improved from 0.64062 to 0.67188, saving model to CV/node_go_seq2-0.hdf5
Epoch 4/100

Epoch 00004: val_acc did not improve from 0.67188
Epoch 5/100

Epoch 00005: val_acc improved from 0.67188 to 0.70312, saving model to CV/node_go_seq2-0.hdf5
Epoch 6/100

Epoch 00006: val_acc did not improve from 0.70312
Epoch 7/100

Epoch 00007: val_acc did not improve from 0.70312
Epoch 8/100

Epoch 00008: val_acc did not improve from 0.70312
Epoch 9/100

Epoch 00009: val_acc did not improve from 0.70312
Epoch 10/100

Epoch 00010: val_acc did not improve from 0.70312
Epoch 11/100

Epoch 00011: val_acc did not improve from 0.70312
Epoch 12/100

Epoch 00012: val_acc did not improve from 0.70312
Epoch 13/100

Epoch 00013: val_acc did not improve from 0.70312
Epoc


Epoch 00004: val_acc improved from 0.71875 to 0.73438, saving model to CV/node_go_seq2-1.hdf5
Epoch 5/100

Epoch 00005: val_acc did not improve from 0.73438
Epoch 6/100

Epoch 00006: val_acc improved from 0.73438 to 0.79688, saving model to CV/node_go_seq2-1.hdf5
Epoch 7/100

Epoch 00007: val_acc did not improve from 0.79688
Epoch 8/100

Epoch 00008: val_acc did not improve from 0.79688
Epoch 9/100

Epoch 00009: val_acc did not improve from 0.79688
Epoch 10/100

Epoch 00010: val_acc did not improve from 0.79688
Epoch 11/100

Epoch 00011: val_acc did not improve from 0.79688
Epoch 12/100

Epoch 00012: val_acc did not improve from 0.79688
Epoch 13/100

Epoch 00013: val_acc did not improve from 0.79688
Epoch 14/100

Epoch 00014: val_acc did not improve from 0.79688
Epoch 15/100

Epoch 00015: val_acc improved from 0.79688 to 0.82812, saving model to CV/node_go_seq2-1.hdf5
Epoch 16/100

Epoch 00016: val_acc did not improve from 0.82812
Epoch 17/100

Epoch 00017: val_acc did not improve fro


Epoch 00001: val_acc improved from -inf to 0.50000, saving model to CV/node_go_seq2-2.hdf5
Epoch 2/100

Epoch 00002: val_acc did not improve from 0.50000
Epoch 3/100

Epoch 00003: val_acc improved from 0.50000 to 0.65625, saving model to CV/node_go_seq2-2.hdf5
Epoch 4/100

Epoch 00004: val_acc did not improve from 0.65625
Epoch 5/100

Epoch 00005: val_acc did not improve from 0.65625
Epoch 6/100

Epoch 00006: val_acc did not improve from 0.65625
Epoch 7/100

Epoch 00007: val_acc improved from 0.65625 to 0.73438, saving model to CV/node_go_seq2-2.hdf5
Epoch 8/100

Epoch 00008: val_acc did not improve from 0.73438
Epoch 9/100

Epoch 00009: val_acc did not improve from 0.73438
Epoch 10/100

Epoch 00010: val_acc improved from 0.73438 to 0.75000, saving model to CV/node_go_seq2-2.hdf5
Epoch 11/100

Epoch 00011: val_acc did not improve from 0.75000
Epoch 12/100

Epoch 00012: val_acc did not improve from 0.75000
Epoch 13/100

Epoch 00013: val_acc did not improve from 0.75000
Epoch 14/100

Ep


Epoch 00005: val_acc did not improve from 0.60938
Epoch 6/100

Epoch 00006: val_acc improved from 0.60938 to 0.67188, saving model to CV/node_go_seq2-3.hdf5
Epoch 7/100

Epoch 00007: val_acc did not improve from 0.67188
Epoch 8/100

Epoch 00008: val_acc did not improve from 0.67188
Epoch 9/100

Epoch 00009: val_acc improved from 0.67188 to 0.71875, saving model to CV/node_go_seq2-3.hdf5
Epoch 10/100

Epoch 00010: val_acc did not improve from 0.71875
Epoch 11/100

Epoch 00011: val_acc did not improve from 0.71875
Epoch 12/100

Epoch 00012: val_acc did not improve from 0.71875
Epoch 13/100

Epoch 00013: val_acc did not improve from 0.71875
Epoch 14/100

Epoch 00014: val_acc improved from 0.71875 to 0.73438, saving model to CV/node_go_seq2-3.hdf5
Epoch 15/100

Epoch 00015: val_acc did not improve from 0.73438
Epoch 16/100

Epoch 00016: val_acc improved from 0.73438 to 0.75000, saving model to CV/node_go_seq2-3.hdf5
Epoch 17/100

Epoch 00017: val_acc did not improve from 0.75000
Epoch 18/


Epoch 00008: val_acc improved from 0.68750 to 0.71875, saving model to CV/node_go_seq2-4.hdf5
Epoch 9/100

Epoch 00009: val_acc did not improve from 0.71875
Epoch 10/100

Epoch 00010: val_acc did not improve from 0.71875
Epoch 11/100

Epoch 00011: val_acc did not improve from 0.71875
Epoch 12/100

Epoch 00012: val_acc did not improve from 0.71875
Epoch 13/100

Epoch 00013: val_acc did not improve from 0.71875
Epoch 14/100

Epoch 00014: val_acc did not improve from 0.71875
Epoch 15/100

Epoch 00015: val_acc did not improve from 0.71875
Epoch 16/100

Epoch 00016: val_acc did not improve from 0.71875
Epoch 17/100

Epoch 00017: val_acc did not improve from 0.71875
Epoch 18/100

Epoch 00018: val_acc did not improve from 0.71875
Epoch 19/100

Epoch 00019: val_acc did not improve from 0.71875
Epoch 20/100

Epoch 00020: val_acc did not improve from 0.71875
Epoch 21/100

Epoch 00021: val_acc did not improve from 0.71875
Epoch 22/100

Epoch 00022: val_acc improved from 0.71875 to 0.76562, savin


Epoch 00008: val_acc did not improve from 0.76562
Epoch 9/100

Epoch 00009: val_acc did not improve from 0.76562
Epoch 10/100

Epoch 00010: val_acc did not improve from 0.76562
Epoch 11/100

Epoch 00011: val_acc did not improve from 0.76562
Epoch 12/100

Epoch 00012: val_acc did not improve from 0.76562
Epoch 13/100

Epoch 00013: val_acc did not improve from 0.76562
Epoch 14/100

Epoch 00014: val_acc did not improve from 0.76562
Epoch 15/100

Epoch 00015: val_acc did not improve from 0.76562
Epoch 16/100

Epoch 00016: val_acc did not improve from 0.76562
Epoch 17/100

Epoch 00017: val_acc did not improve from 0.76562
Epoch 18/100

Epoch 00018: val_acc did not improve from 0.76562
Epoch 19/100

Epoch 00019: val_acc did not improve from 0.76562
Epoch 20/100

Epoch 00020: val_acc did not improve from 0.76562
Epoch 21/100

Epoch 00021: val_acc did not improve from 0.76562
Epoch 22/100

Epoch 00022: val_acc did not improve from 0.76562
Epoch 23/100

Epoch 00023: val_acc did not improve fro


Epoch 00025: val_acc did not improve from 0.84375
Epoch 26/100

Epoch 00026: val_acc did not improve from 0.84375
Epoch 27/100

Epoch 00027: val_acc did not improve from 0.84375
Epoch 28/100

Epoch 00028: val_acc did not improve from 0.84375
Epoch 29/100

Epoch 00029: val_acc did not improve from 0.84375
Epoch 30/100

Epoch 00030: val_acc did not improve from 0.84375
Epoch 31/100

Epoch 00031: val_acc did not improve from 0.84375
Epoch 32/100

Epoch 00032: val_acc did not improve from 0.84375
--------------------------

AUC: 0.942926
ACC: 0.861111
MCC : 0.719501
TPR:0.898305
FPR:0.183673
Pre:0.854839
F1:0.876033
--------------------------

Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.64062, saving model to CV/node_go_seq2-7.hdf5
Epoch 2/100

Epoch 00002: val_acc did not improve from 0.64062
Epoch 3/100

Epoch 00003: val_acc did not improve from 0.64062
Epoch 4/100

Epoch 00004: val_acc did not improve from 0.64062
Epoch 5/100

Epoch 00005: val_acc did not improve from 0.6

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


--------------------------

AUC: 0.500000
ACC: 0.472222
MCC : 0.000000
TPR:1.000000
FPR:1.000000
Pre:0.472222
F1:0.641509
--------------------------

Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.48438, saving model to CV/node_go_seq2-8.hdf5
Epoch 2/100

Epoch 00002: val_acc improved from 0.48438 to 0.68750, saving model to CV/node_go_seq2-8.hdf5
Epoch 3/100

Epoch 00003: val_acc did not improve from 0.68750
Epoch 4/100

Epoch 00004: val_acc improved from 0.68750 to 0.73438, saving model to CV/node_go_seq2-8.hdf5
Epoch 5/100

Epoch 00005: val_acc did not improve from 0.73438
Epoch 6/100

Epoch 00006: val_acc did not improve from 0.73438
Epoch 7/100

Epoch 00007: val_acc did not improve from 0.73438
Epoch 8/100

Epoch 00008: val_acc did not improve from 0.73438
Epoch 9/100

Epoch 00009: val_acc did not improve from 0.73438
Epoch 10/100

Epoch 00010: val_acc did not improve from 0.73438
Epoch 11/100

Epoch 00011: val_acc did not improve from 0.73438
Epoch 12/100

Epoch 00012:


Epoch 00018: val_acc did not improve from 0.71875
Epoch 19/100

Epoch 00019: val_acc did not improve from 0.71875
Epoch 20/100

Epoch 00020: val_acc did not improve from 0.71875
Epoch 21/100

Epoch 00021: val_acc did not improve from 0.71875
Epoch 22/100

Epoch 00022: val_acc did not improve from 0.71875
Epoch 23/100

Epoch 00023: val_acc did not improve from 0.71875
Epoch 24/100

Epoch 00024: val_acc did not improve from 0.71875
Epoch 25/100

Epoch 00025: val_acc did not improve from 0.71875
Epoch 26/100

Epoch 00026: val_acc improved from 0.71875 to 0.75000, saving model to CV/node_go_seq2-9.hdf5
Epoch 27/100

Epoch 00027: val_acc improved from 0.75000 to 0.76562, saving model to CV/node_go_seq2-9.hdf5
Epoch 28/100

Epoch 00028: val_acc did not improve from 0.76562
Epoch 29/100

Epoch 00029: val_acc did not improve from 0.76562
Epoch 30/100

Epoch 00030: val_acc did not improve from 0.76562
Epoch 31/100

Epoch 00031: val_acc did not improve from 0.76562
Epoch 32/100

Epoch 00032: va

In [12]:
import numpy as np

# Result archives whose per-metric arrays are summarized below.
RESULT_FILES = ['node2vec_go_seq0.npz', 'node2vec_go_seq1.npz', 'node2vec_go_seq2.npz']

# (printf-style label, archive key) for each reported metric.
# The labels are kept verbatim (including their uneven spacing) so the
# printed summary is unchanged.
METRIC_FORMATS = [
    ('mean AUC: %f', 'AUCs'),
    ('mean ACC: %f', 'ACCs'),
    ('mean MCC : %f', 'MCCs'),
    ('mean TPR:%f', 'TPRs'),
    ('mean FPR:%f', 'FPRs'),
    ('mean Pre:%f', 'Precs'),
    ('mean F1:%f', 'F1s'),
]


def load_metrics(path, keys):
    """Read the arrays named in ``keys`` from the .npz archive at ``path``.

    ``np.load`` on an .npz file returns a lazy ``NpzFile`` that keeps the
    underlying file open; using it as a context manager guarantees the handle
    is closed once the requested arrays have been materialized.

    Returns a plain dict mapping each key to its array.
    Raises ``KeyError`` if a requested key is missing from the archive.
    """
    with np.load(path) as archive:
        return {key: archive[key] for key in keys}


metric_keys = [key for _, key in METRIC_FORMATS]

# Keep the original variable names; each is now a dict of already-loaded arrays.
results1, results2, results3 = (load_metrics(path, metric_keys) for path in RESULT_FILES)
all_results = (results1, results2, results3)

for label, key in METRIC_FORMATS:
    # Average of the per-file means (not a pooled mean over all entries),
    # summed in file order exactly as the original expression did.
    mean_of_means = sum(np.mean(run[key]) for run in all_results) / len(all_results)
    print(label % mean_of_means)

mean AUC: 0.906283
mean ACC: 0.841498
mean MCC : 0.688606
mean TPR:0.829777
mean FPR:0.148819
mean Pre:0.867075
mean F1:0.838586
