In [1]:
from __future__ import print_function 
import os, sys, h5py
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
from six.moves import cPickle
from sklearn.metrics import roc_curve, auc, precision_recall_curve, accuracy_score, roc_auc_score

sys.path.append('..')
import helper
from deepomics import neuralnetwork as nn
from deepomics import utils, fit

In [2]:

def experiment_correpsondence(correspondence_path, experiments):
    rncmpt_names = []
    clip_names = []
    rbp_names = []
    cell_types = []
    with open(correspondence_path, 'rb') as f:
        for line in f:
            index = line.index('-')
            rncmpt_names.append(line[:index])
            experiment = line[index+1:].split()[0]
            index = experiment.index('_')
            rbp_names.append(experiment[:index])
            cell_types.append(experiment[index+1:])
            clip_names.append(experiment+'_200.h5')
            
    # get a dictionary of tuples for each rbp correspondence
    unique_rbps = np.unique(rbp_names)
    match = {}
    for j, rbp_name in enumerate(unique_rbps):
        match[rbp_name] = []
    for j, rbp_name in enumerate(rbp_names):
        rbp_index = np.where(experiments==rncmpt_names[j])[0][0]
        match[rbp_name].append((rbp_index, rncmpt_names[j], clip_names[j]))
    return match

def binding_affinity_scores(train, model, input_shape, output_shape, file_path):
    
    X_train = train['inputs']
    y_train = train['targets']
    
    # merge dataset
    index = np.where(y_train[:,0]==1)[0]
    X_pos = X_train[index]
    y_pos = y_train[index]
    index = np.where(y_train[:,0]==0)[0]
    X_neg = X_train[index]
    y_neg = y_train[index]
    num_split = X_pos.shape[1] - 41
    
    # load model
    genome_model = helper.import_model(model)
    model_layers, optimization = genome_model.model(input_shape, output_shape)
    
    # build neural network class
    nnmodel = nn.NeuralNet(seed=247)
    nnmodel.build_layers(model_layers, optimization, use_scope=False)

    nntrainer = nn.NeuralTrainer(nnmodel, save='best', file_path=file_path)

    # initialize session
    sess = utils.initialize_session(nnmodel.placeholders)
    
    # load best model
    nntrainer.set_best_parameters(sess, verbose=False)

    pos_score = []
    pos_max_score = []
    pos_mean_score = []
    for X in X_pos:
        X_split = []
        for i in range(num_split):
            X_split.append([X[i:i+41,:,:]])
        X_split = np.vstack(X_split)

        affinity = nntrainer.get_activations(sess, {'inputs': X_split})
        affinity = np.sort(affinity[:,0])[::-1]
        pos_max_score.append(affinity[0])
        pos_mean_score.append(np.mean(affinity))
        pos_score.append(np.mean(affinity[:20]))

    pos_score = np.array(pos_score)
    pos_max_score = np.array(pos_max_score)
    pos_mean_score = np.array(pos_mean_score)

    neg_score = []
    neg_max_score = []
    neg_mean_score = []
    for X in X_neg:
        X_split = []
        for i in range(num_split):
            X_split.append([X[i:i+41,:,:]])
        X_split = np.vstack(X_split)

        affinity = nntrainer.get_activations(sess, {'inputs': X_split})
        affinity = np.sort(affinity[:,0])[::-1]
        neg_max_score.append(affinity[0])
        neg_mean_score.append(np.mean(affinity))
        neg_score.append(np.mean(affinity[:20]))
    neg_score = np.array(neg_score)
    neg_mean_score = np.array(neg_mean_score)
    neg_max_score = np.array(neg_max_score)

    top_score = np.vstack([np.expand_dims(pos_score,axis=1), np.expand_dims(neg_score, axis=1)])
    mean_score = np.vstack([np.expand_dims(pos_mean_score,axis=1), np.expand_dims(neg_mean_score, axis=1)])
    max_score = np.vstack([np.expand_dims(pos_max_score,axis=1), np.expand_dims(neg_max_score, axis=1)])
    
    y_true = np.vstack([np.ones((len(pos_score),1)), np.zeros((len(neg_score),1))])
    return y_true, max_score, top_score, mean_score




In [None]:
model = 'affinity_residualbind' # 'affinity_residualbind'
rncmpt_name = 'RNCMPT00074'
clip_name = 'SRSF9_HepG2_200.h5'
ss_type = 'seq'
normalize_method = 'log_norm' 

In [None]:
# get list of encode-eclip experiments
data_path = '../../data/RNAcompete_2013/rnacompete2013.h5'
experiments = helper.get_experiments_hdf5(data_path)

rbp_index = np.where(rncmpt_name == experiments)[0][0]

# load rbp dataset
train, valid, test = helper.load_dataset_hdf5(data_path, ss_type=ss_type, rbp_index=rbp_index)

# process rbp dataset
train, valid, test = helper.process_data(train, valid, test, method=normalize_method)

input_shape = list(train['inputs'].shape)
input_shape[0] = None
output_shape = train['targets'].shape

# get corresponding clip-experiments for rnacompete experiments
correspondence_path = 'correspondences_eCLIP_RNACompete.txt'
match = experiment_correpsondence(correspondence_path, experiments)

# load clip-seq dataset
dataset_path = '/media/peter/storage/encode_eclip/eclip_datasets'
dataset_file_path = os.path.join(dataset_path, clip_name)
clip_train, clip_valid, clip_test = helper.load_dataset_hdf5(dataset_file_path, ss_type=ss_type)
X_train = np.vstack([clip_train['inputs'], clip_valid['inputs'], clip_test['inputs']])
y_train = np.vstack([clip_train['targets'], clip_valid['targets'], clip_test['targets']])
clip_train = {'inputs': X_train, 'targets': y_train}

results_path = helper.make_directory('../../results', 'RNAcompete_2013')
file_path = os.path.join(results_path, normalize_method+'_'+ss_type, model, rncmpt_name)

In [15]:
print(file_path)
y_true, max_score, top_score, mean_score = binding_affinity_scores(clip_train, model, input_shape, output_shape, file_path)

print(rbp_index, rncmpt_name, clip_name)

fpr, tpr, thresholds = roc_curve(y_true, mean_score)
roc_score = auc(fpr, tpr)
print('mean = ', roc_score)

fpr, tpr, thresholds = roc_curve(y_true, max_score)
roc_score = auc(fpr, tpr)
print('max  = ', roc_score)

fpr, tpr, thresholds = roc_curve(y_true, top_score)
roc_score = auc(fpr, tpr)
print('T20  = ', roc_score)

../../results/RNAcompete_2013/log_norm_seq/affinity_all_conv_net/RNCMPT00073
{'layer': 'input', 'input_shape': [None, 41, 1, 4]}
{'layer': 'conv1d', 'strides': 1, 'activation': 'leaky_relu', 'padding': 'SAME', 'filter_size': 11, 'num_filters': 16, 'norm': 'batch'}
{'padding': 'VALID', 'filter_size': 10, 'layer': 'conv1d', 'num_filters': 16, 'strides': 2, 'dropout': 0.1, 'activation': 'leaky_relu', 'norm': 'batch'}
{'layer': 'conv1d', 'strides': 1, 'activation': 'leaky_relu', 'padding': 'SAME', 'filter_size': 7, 'num_filters': 32, 'norm': 'batch'}
{'padding': 'VALID', 'filter_size': 7, 'layer': 'conv1d', 'strides': 2, 'num_filters': 32, 'activation': 'leaky_relu', 'dropout': 0.3, 'norm': 'batch'}
{'layer': 'conv1d', 'dropout': 0.4, 'padding': 'SAME', 'filter_size': 5, 'num_filters': 64, 'activation': 'leaky_relu', 'norm': 'batch'}
{'layer': 'conv1d', 'dropout': 0.4, 'padding': 'VALID', 'filter_size': 5, 'num_filters': 64, 'activation': 'leaky_relu', 'norm': 'batch'}
{'layer': 'dense', '

NotFoundError: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for ../../results/RNAcompete_2013/log_norm_seq/affinity_all_conv_net/RNCMPT00073_best.ckpt
	 [[Node: save/RestoreV2_30 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save/Const_0, save/RestoreV2_30/tensor_names, save/RestoreV2_30/shape_and_slices)]]
	 [[Node: save/RestoreV2_10/_35 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/gpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=1, tensor_name="edge_192_save/RestoreV2_10", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]

Caused by op u'save/RestoreV2_30', defined at:
  File "/home/peter/anaconda2/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/home/peter/anaconda2/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/home/peter/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/home/peter/anaconda2/lib/python2.7/site-packages/traitlets/config/application.py", line 589, in launch_instance
    app.start()
  File "/home/peter/anaconda2/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 442, in start
    ioloop.IOLoop.instance().start()
  File "/home/peter/anaconda2/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/home/peter/anaconda2/lib/python2.7/site-packages/tornado/ioloop.py", line 883, in start
    handler_func(fd_obj, events)
  File "/home/peter/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/peter/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/peter/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/peter/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/peter/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/peter/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/peter/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/peter/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 391, in execute_request
    user_expressions, allow_stdin)
  File "/home/peter/anaconda2/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 199, in do_execute
    shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/peter/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2723, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/peter/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2825, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/peter/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2885, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-7a525ea94b3d>", line 2, in <module>
    y_true, max_score, top_score, mean_score = binding_affinity_scores(clip_train, model, input_shape, output_shape, file_path)
  File "<ipython-input-5-efbdcd174afd>", line 55, in binding_affinity_scores
    nntrainer.set_best_parameters(sess, verbose=False)
  File "../deepomics/neuralnetwork.py", line 498, in set_best_parameters
    self.nnmodel.load_model_parameters(sess, file_path, verbose=verbose)
  File "../deepomics/neuralnetwork.py", line 168, in load_model_parameters
    saver = tf.train.Saver()
  File "/home/peter/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1051, in __init__
    self.build()
  File "/home/peter/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1081, in build
    restore_sequentially=self._restore_sequentially)
  File "/home/peter/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 675, in build
    restore_sequentially, reshape)
  File "/home/peter/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 402, in _AddRestoreOps
    tensors = self.restore_op(filename_tensor, saveable, preferred_shard)
  File "/home/peter/anaconda2/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 242, in restore_op
    [spec.tensor.dtype])[0])
  File "/home/peter/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/gen_io_ops.py", line 668, in restore_v2
    dtypes=dtypes, name=name)
  File "/home/peter/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
    op_def=op_def)
  File "/home/peter/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2395, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home/peter/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1264, in __init__
    self._traceback = _extract_stack()

NotFoundError (see above for traceback): Unsuccessful TensorSliceReader constructor: Failed to find any matching files for ../../results/RNAcompete_2013/log_norm_seq/affinity_all_conv_net/RNCMPT00073_best.ckpt
	 [[Node: save/RestoreV2_30 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save/Const_0, save/RestoreV2_30/tensor_names, save/RestoreV2_30/shape_and_slices)]]
	 [[Node: save/RestoreV2_10/_35 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/gpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=1, tensor_name="edge_192_save/RestoreV2_10", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]


In [13]:
models = ['affinity_residualbind']
ss_type = 'seq'
normalize_method = 'log_norm' 

# get list of encode-eclip experiments
data_path = '../../data/RNAcompete_2013/rnacompete2013.h5'
experiments = helper.get_experiments_hdf5(data_path)

# get corresponding clip-experiments for rnacompete experiments
correspondence_path = 'correspondences_eCLIP_RNACompete.txt'
match = experiment_correpsondence(correspondence_path, experiments)

# directory for encode-clip experiments
dataset_path = '/media/peter/storage/encode_eclip/eclip_datasets'

results = []
for key in match.keys():
    for experiments in match[key]:

        for model in models:
            rbp_index = experiments[0]
            rncmpt_name = experiments[1]
            clip_name = experiments[2]


            print(rbp_index, rncmpt_name, clip_name, model)


            # load rbp dataset
            train, valid, test = helper.load_dataset_hdf5(data_path, ss_type=ss_type, rbp_index=rbp_index)

            # process rbp dataset
            train, valid, test = helper.process_data(train, valid, test, method=normalize_method)

            input_shape = list(train['inputs'].shape)
            input_shape[0] = None
            output_shape = train['targets'].shape
            
            # load clip-seq dataset
            dataset_file_path = os.path.join(dataset_path, clip_name)
            clip_train, clip_valid, clip_test = helper.load_dataset_hdf5(dataset_file_path, ss_type=ss_type)
            X_train = np.vstack([clip_train['inputs'], clip_valid['inputs'], clip_test['inputs']])
            y_train = np.vstack([clip_train['targets'], clip_valid['targets'], clip_test['targets']])
            clip_train = {'inputs': X_train, 'targets': y_train}

            results_path = helper.make_directory('../../results', 'RNAcompete_2013')
            #results_path = helper.make_directory('../../results', 'test')
            file_path = os.path.join(results_path, normalize_method+'_'+ss_type, model, rncmpt_name)
            y_true, max_score, top_score, mean_score = binding_affinity_scores(clip_train, model, input_shape, output_shape, file_path)

            fpr, tpr, thresholds = roc_curve(y_true, mean_score)
            roc_score = auc(fpr, tpr)
            print('mean = ', roc_score)

            fpr, tpr, thresholds = roc_curve(y_true, max_score)
            roc_score = auc(fpr, tpr)
            print('max  = ', roc_score)

            fpr, tpr, thresholds = roc_curve(y_true, top_score)
            roc_score = auc(fpr, tpr)
            print('T20  = ', roc_score)


188 RNCMPT00047 QKI_HepG2_200.h5 affinity_residualbind
mean =  0.834434448241
max  =  0.750360430262
T20  =  0.775964062917
188 RNCMPT00047 QKI_K562_200.h5 affinity_residualbind
mean =  0.863534182336
max  =  0.764704209166
T20  =  0.794567178357
173 RNCMPT00033 IGF2BP2_K562_200.h5 affinity_residualbind
mean =  0.429468737601
max  =  0.447641055013
T20  =  0.446980077622
6 RNCMPT00106 SRSF1_HepG2_200.h5 affinity_residualbind
mean =  0.865529197137
max  =  0.815291759729
T20  =  0.839197314446
7 RNCMPT00107 SRSF1_HepG2_200.h5 affinity_residualbind
mean =  0.882229693851
max  =  0.830341048328
T20  =  0.856919693673
8 RNCMPT00108 SRSF1_HepG2_200.h5 affinity_residualbind
mean =  0.863691557557
max  =  0.837281119394
T20  =  0.84940981986
9 RNCMPT00109 SRSF1_HepG2_200.h5 affinity_residualbind
mean =  0.859354839848
max  =  0.831068843841
T20  =  0.845979025321
11 RNCMPT00110 SRSF1_HepG2_200.h5 affinity_residualbind
mean =  0.713219467825
max  =  0.677149939838
T20  =  0.702281077508
64 RNC

In [4]:
models = ['affinity_residualbind']
ss_type = 'pu'
normalize_method = 'log_norm' 

# get list of encode-eclip experiments
data_path = '../../data/RNAcompete_2013/rnacompete2013.h5'
experiments = helper.get_experiments_hdf5(data_path)

# get corresponding clip-experiments for rnacompete experiments
correspondence_path = 'correspondences_eCLIP_RNACompete.txt'
match = experiment_correpsondence(correspondence_path, experiments)

# directory for encode-clip experiments
dataset_path = '/media/peter/storage/encode_eclip/eclip_datasets'

results = []
for key in match.keys():
    for experiments in match[key]:

        for model in models:
            rbp_index = experiments[0]
            rncmpt_name = experiments[1]
            clip_name = experiments[2]


            print(rbp_index, rncmpt_name, clip_name, model)


            # load rbp dataset
            train, valid, test = helper.load_dataset_hdf5(data_path, ss_type=ss_type, rbp_index=rbp_index)

            # process rbp dataset
            train, valid, test = helper.process_data(train, valid, test, method=normalize_method)

            input_shape = list(train['inputs'].shape)
            input_shape[0] = None
            output_shape = train['targets'].shape
            
            # load clip-seq dataset
            dataset_file_path = os.path.join(dataset_path, clip_name)
            clip_train, clip_valid, clip_test = helper.load_dataset_hdf5(dataset_file_path, ss_type=ss_type)
            X_train = np.vstack([clip_train['inputs'], clip_valid['inputs'], clip_test['inputs']])
            y_train = np.vstack([clip_train['targets'], clip_valid['targets'], clip_test['targets']])
            clip_train = {'inputs': X_train, 'targets': y_train}

            results_path = helper.make_directory('../../results', 'RNAcompete_2013')
            #results_path = helper.make_directory('../../results', 'test')
            file_path = os.path.join(results_path, normalize_method+'_'+ss_type, model, rncmpt_name)
            y_true, max_score, top_score, mean_score = binding_affinity_scores(clip_train, model, input_shape, output_shape, file_path)

            fpr, tpr, thresholds = roc_curve(y_true, mean_score)
            roc_score = auc(fpr, tpr)
            print('mean = ', roc_score)

            fpr, tpr, thresholds = roc_curve(y_true, max_score)
            roc_score = auc(fpr, tpr)
            print('max  = ', roc_score)

            fpr, tpr, thresholds = roc_curve(y_true, top_score)
            roc_score = auc(fpr, tpr)
            print('T20  = ', roc_score)


188 RNCMPT00047 QKI_HepG2_200.h5 affinity_residualbind
mean =  0.808023446887
max  =  0.72570741043
T20  =  0.757196039068
188 RNCMPT00047 QKI_K562_200.h5 affinity_residualbind
mean =  0.838832570359
max  =  0.741458982078
T20  =  0.774387934671
173 RNCMPT00033 IGF2BP2_K562_200.h5 affinity_residualbind
mean =  0.421771528255
max  =  0.450722441466
T20  =  0.448103999806
6 RNCMPT00106 SRSF1_HepG2_200.h5 affinity_residualbind
mean =  0.872922082818
max  =  0.826283206221
T20  =  0.850226238292
7 RNCMPT00107 SRSF1_HepG2_200.h5 affinity_residualbind
mean =  0.872113012887
max  =  0.832236300153
T20  =  0.848043586603
8 RNCMPT00108 SRSF1_HepG2_200.h5 affinity_residualbind
mean =  0.850366300716
max  =  0.828442857103
T20  =  0.834201950615
9 RNCMPT00109 SRSF1_HepG2_200.h5 affinity_residualbind
mean =  0.85080647591
max  =  0.828570353591
T20  =  0.836892236735
11 RNCMPT00110 SRSF1_HepG2_200.h5 affinity_residualbind
mean =  0.759686977384
max  =  0.720179526812
T20  =  0.746078435839
64 RNCM

In [3]:
models = ['affinity_conv_net']
ss_type = 'seq'
normalize_method = 'log_norm' 

# get list of encode-eclip experiments
data_path = '../../data/RNAcompete_2013/rnacompete2013.h5'
experiments = helper.get_experiments_hdf5(data_path)

# get corresponding clip-experiments for rnacompete experiments
correspondence_path = 'correspondences_eCLIP_RNACompete.txt'
match = experiment_correpsondence(correspondence_path, experiments)

# directory for encode-clip experiments
dataset_path = '/media/peter/storage/encode_eclip/eclip_datasets'

results = []
for key in match.keys():
    for experiments in match[key]:

        for model in models:
            rbp_index = experiments[0]
            rncmpt_name = experiments[1]
            clip_name = experiments[2]


            print(rbp_index, rncmpt_name, clip_name, model)


            # load rbp dataset
            train, valid, test = helper.load_dataset_hdf5(data_path, ss_type=ss_type, rbp_index=rbp_index)

            # process rbp dataset
            train, valid, test = helper.process_data(train, valid, test, method=normalize_method)

            input_shape = list(train['inputs'].shape)
            input_shape[0] = None
            output_shape = train['targets'].shape
            
            # load clip-seq dataset
            dataset_file_path = os.path.join(dataset_path, clip_name)
            clip_train, clip_valid, clip_test = helper.load_dataset_hdf5(dataset_file_path, ss_type=ss_type)
            X_train = np.vstack([clip_train['inputs'], clip_valid['inputs'], clip_test['inputs']])
            y_train = np.vstack([clip_train['targets'], clip_valid['targets'], clip_test['targets']])
            clip_train = {'inputs': X_train, 'targets': y_train}

            try:
                results_path = helper.make_directory('../../results', 'RNAcompete_2013')
                #results_path = helper.make_directory('../../results', 'test')
                file_path = os.path.join(results_path, normalize_method+'_'+ss_type, model, rncmpt_name)
                y_true, max_score, top_score, mean_score = binding_affinity_scores(clip_train, model, input_shape, output_shape, file_path)

                fpr, tpr, thresholds = roc_curve(y_true, mean_score)
                roc_score = auc(fpr, tpr)
                print('mean = ', roc_score)

                fpr, tpr, thresholds = roc_curve(y_true, max_score)
                roc_score = auc(fpr, tpr)
                print('max  = ', roc_score)

                fpr, tpr, thresholds = roc_curve(y_true, top_score)
                roc_score = auc(fpr, tpr)
                print('T20  = ', roc_score)
            except:
                print('cannot load model')

188 RNCMPT00047 QKI_HepG2_200.h5 affinity_conv_net
mean =  0.844357713322
max  =  0.785209614775
T20  =  0.80351108074
188 RNCMPT00047 QKI_K562_200.h5 affinity_conv_net
mean =  0.884661579678
max  =  0.815923027392
T20  =  0.838438842998
173 RNCMPT00033 IGF2BP2_K562_200.h5 affinity_conv_net
mean =  0.407670932755
max  =  0.431377144056
T20  =  0.427062777598
6 RNCMPT00106 SRSF1_HepG2_200.h5 affinity_conv_net
mean =  0.875279040944
max  =  0.824015091469
T20  =  0.84511814846
7 RNCMPT00107 SRSF1_HepG2_200.h5 affinity_conv_net
mean =  0.87504183134
max  =  0.826886077222
T20  =  0.844141018317
8 RNCMPT00108 SRSF1_HepG2_200.h5 affinity_conv_net
mean =  0.860124668785
max  =  0.832203011445
T20  =  0.840608667498
9 RNCMPT00109 SRSF1_HepG2_200.h5 affinity_conv_net
mean =  0.858372639239
max  =  0.828812266235
T20  =  0.84013233768
11 RNCMPT00110 SRSF1_HepG2_200.h5 affinity_conv_net
mean =  0.774680049952
max  =  0.697273515166
T20  =  0.72524234988
64 RNCMPT00163 SRSF1_HepG2_200.h5 affinity

In [5]:
models = ['affinity_conv_net']
ss_type = 'pu'
normalize_method = 'log_norm' 

# get list of encode-eclip experiments
data_path = '../../data/RNAcompete_2013/rnacompete2013.h5'
experiments = helper.get_experiments_hdf5(data_path)

# get corresponding clip-experiments for rnacompete experiments
correspondence_path = 'correspondences_eCLIP_RNACompete.txt'
match = experiment_correpsondence(correspondence_path, experiments)

# directory for encode-clip experiments
dataset_path = '/media/peter/storage/encode_eclip/eclip_datasets'

results = []
for key in match.keys():
    for experiments in match[key]:

        for model in models:
            rbp_index = experiments[0]
            rncmpt_name = experiments[1]
            clip_name = experiments[2]


            print(rbp_index, rncmpt_name, clip_name, model)


            # load rbp dataset
            train, valid, test = helper.load_dataset_hdf5(data_path, ss_type=ss_type, rbp_index=rbp_index)

            # process rbp dataset
            train, valid, test = helper.process_data(train, valid, test, method=normalize_method)

            input_shape = list(train['inputs'].shape)
            input_shape[0] = None
            output_shape = train['targets'].shape
            
            # load clip-seq dataset
            dataset_file_path = os.path.join(dataset_path, clip_name)
            clip_train, clip_valid, clip_test = helper.load_dataset_hdf5(dataset_file_path, ss_type=ss_type)
            X_train = np.vstack([clip_train['inputs'], clip_valid['inputs'], clip_test['inputs']])
            y_train = np.vstack([clip_train['targets'], clip_valid['targets'], clip_test['targets']])
            clip_train = {'inputs': X_train, 'targets': y_train}

            try:
                results_path = helper.make_directory('../../results', 'RNAcompete_2013')
                #results_path = helper.make_directory('../../results', 'test')
                file_path = os.path.join(results_path, normalize_method+'_'+ss_type, model, rncmpt_name)
                y_true, max_score, top_score, mean_score = binding_affinity_scores(clip_train, model, input_shape, output_shape, file_path)

                fpr, tpr, thresholds = roc_curve(y_true, mean_score)
                roc_score = auc(fpr, tpr)
                print('mean = ', roc_score)

                fpr, tpr, thresholds = roc_curve(y_true, max_score)
                roc_score = auc(fpr, tpr)
                print('max  = ', roc_score)

                fpr, tpr, thresholds = roc_curve(y_true, top_score)
                roc_score = auc(fpr, tpr)
                print('T20  = ', roc_score)
            except:
                print('cannot load model')

188 RNCMPT00047 QKI_HepG2_200.h5 affinity_conv_net
mean =  0.79026567108
max  =  0.68788224752
T20  =  0.717302133694
188 RNCMPT00047 QKI_K562_200.h5 affinity_conv_net
mean =  0.818731473321
max  =  0.67976864144
T20  =  0.720376881561
173 RNCMPT00033 IGF2BP2_K562_200.h5 affinity_conv_net
mean =  0.406353232205
max  =  0.434313445858
T20  =  0.430295917501
6 RNCMPT00106 SRSF1_HepG2_200.h5 affinity_conv_net
mean =  0.875077985964
max  =  0.834480973188
T20  =  0.85200457547
7 RNCMPT00107 SRSF1_HepG2_200.h5 affinity_conv_net
mean =  0.870816884327
max  =  0.834366850972
T20  =  0.85180454928
8 RNCMPT00108 SRSF1_HepG2_200.h5 affinity_conv_net
mean =  0.851166993357
max  =  0.816116114542
T20  =  0.827378132831
9 RNCMPT00109 SRSF1_HepG2_200.h5 affinity_conv_net
mean =  0.852748537685
max  =  0.819582108399
T20  =  0.831733001338
11 RNCMPT00110 SRSF1_HepG2_200.h5 affinity_conv_net
mean =  0.746717755403
max  =  0.686105263746
T20  =  0.710344293426
64 RNCMPT00163 SRSF1_HepG2_200.h5 affinity

In [3]:

def ensemble_binding_affinity_scores(clip_train, models, input_shape, output_shape, file_path, rncmpt_name, num_ave=20):
    

    results_path = helper.make_directory('../../results', 'RNAcompete_2013')
    
    ensemble_score = 0
    for model in models:
        file_path = os.path.join(results_path, normalize_method+'_'+ss_type, model, rncmpt_name)
        y_true, max_score, top_score, mean_score = binding_affinity_scores(clip_train, model, input_shape, output_shape, file_path)
        ensemble_score += mean_score
        fpr, tpr, thresholds = roc_curve(y_true[:,0], mean_score[:,0])
        roc_score = auc(fpr, tpr)
        print('      ', model, roc_score)

    ensemble_score /= len(models)
    
    return y_true, ensemble_score


In [4]:
models = ['affinity_conv_net', 'affinity_residualbind']
ss_type = 'seq'
normalize_method = 'log_norm' 

# get list of encode-eclip experiments
data_path = '../../data/RNAcompete_2013/rnacompete2013.h5'
experiments = helper.get_experiments_hdf5(data_path)

# get corresponding clip-experiments for rnacompete experiments
correspondence_path = 'correspondences_eCLIP_RNACompete.txt'
match = experiment_correpsondence(correspondence_path, experiments)

# directory for encode-clip experiments
dataset_path = '/media/peter/storage/encode_eclip/eclip_datasets'

results = []
for key in match.keys():
    for experiments in match[key]:
        try:
            rbp_index = experiments[0]
            rncmpt_name = experiments[1]
            clip_name = experiments[2]

            print(rbp_index, rncmpt_name, clip_name)

            # load rbp dataset
            train, valid, test = helper.load_dataset_hdf5(data_path, ss_type=ss_type, rbp_index=rbp_index)

            input_shape = list(train['inputs'].shape)
            input_shape[0] = None
            output_shape = train['targets'].shape

            # load clip-seq dataset
            dataset_file_path = os.path.join(dataset_path, clip_name)
            clip_train, clip_valid, clip_test = helper.load_dataset_hdf5(dataset_file_path, ss_type=ss_type)
            X_train = np.vstack([clip_train['inputs'], clip_valid['inputs'], clip_test['inputs']])
            y_train = np.vstack([clip_train['targets'], clip_valid['targets'], clip_test['targets']])
            clip_train = {'inputs': X_train, 'targets': y_train}

            results_path = helper.make_directory('../../results', 'RNAcompete_2013')
            file_path = os.path.join(results_path, normalize_method+'_'+ss_type)
            y_true, ensemble_score = ensemble_binding_affinity_scores(clip_train, models, input_shape, output_shape, file_path, rncmpt_name, num_ave=20)

            fpr, tpr, thresholds = roc_curve(y_true[:,0], ensemble_score[:,0])
            roc_score = auc(fpr, tpr)
            print('    Ensemble', roc_score)
        except: 
            print('    no corresponding experiment')

188 RNCMPT00047 QKI_HepG2_200.h5
       affinity_conv_net 0.844357713322
       affinity_residualbind 0.834434448241
    Ensemble 0.850037891777
188 RNCMPT00047 QKI_K562_200.h5
       affinity_conv_net 0.884661579678
       affinity_residualbind 0.863534182336
    Ensemble 0.889097503319
173 RNCMPT00033 IGF2BP2_K562_200.h5
       affinity_conv_net 0.407670932755
       affinity_residualbind 0.429468737601
    Ensemble 0.41754169071
6 RNCMPT00106 SRSF1_HepG2_200.h5
       affinity_conv_net 0.875279040944
       affinity_residualbind 0.865529197137
    Ensemble 0.872713017972
7 RNCMPT00107 SRSF1_HepG2_200.h5
       affinity_conv_net 0.87504183134
       affinity_residualbind 0.882229693851
    Ensemble 0.880489127967
8 RNCMPT00108 SRSF1_HepG2_200.h5
       affinity_conv_net 0.860124668785
       affinity_residualbind 0.863691557557
    Ensemble 0.862642265137
9 RNCMPT00109 SRSF1_HepG2_200.h5
       affinity_conv_net 0.858372639239
       affinity_residualbind 0.859354839848
    Ensemble 

In [6]:
models = ['affinity_conv_net', 'affinity_residualbind', 'affinity_all_conv_net']
ss_type = 'seq'
normalize_method = 'log_norm' 

# get list of encode-eclip experiments
data_path = '../../data/RNAcompete_2013/rnacompete2013.h5'
experiments = helper.get_experiments_hdf5(data_path)

# get corresponding clip-experiments for rnacompete experiments
correspondence_path = 'correspondences_eCLIP_RNACompete.txt'
match = experiment_correpsondence(correspondence_path, experiments)

# directory for encode-clip experiments
dataset_path = '/media/peter/storage/encode_eclip/eclip_datasets'

results = []
for key in match.keys():
    for experiments in match[key]:
        try:
            rbp_index = experiments[0]
            rncmpt_name = experiments[1]
            clip_name = experiments[2]

            print(rbp_index, rncmpt_name, clip_name)

            # load rbp dataset
            train, valid, test = helper.load_dataset_hdf5(data_path, ss_type=ss_type, rbp_index=rbp_index)

            input_shape = list(train['inputs'].shape)
            input_shape[0] = None
            output_shape = train['targets'].shape

            # load clip-seq dataset
            dataset_file_path = os.path.join(dataset_path, clip_name)
            clip_train, clip_valid, clip_test = helper.load_dataset_hdf5(dataset_file_path, ss_type=ss_type)
            X_train = np.vstack([clip_train['inputs'], clip_valid['inputs'], clip_test['inputs']])
            y_train = np.vstack([clip_train['targets'], clip_valid['targets'], clip_test['targets']])
            clip_train = {'inputs': X_train, 'targets': y_train}

            results_path = helper.make_directory('../../results', 'RNAcompete_2013')
            file_path = os.path.join(results_path, normalize_method+'_'+ss_type)
            y_true, ensemble_score = ensemble_binding_affinity_scores(clip_train, models, input_shape, output_shape, file_path, rncmpt_name, num_ave=20)

            fpr, tpr, thresholds = roc_curve(y_true[:,0], ensemble_score[:,0])
            roc_score = auc(fpr, tpr)
            print('    Ensemble', roc_score)
        except: 
            print('    no corresponding experiment')

188 RNCMPT00047 QKI_HepG2_200.h5
       affinity_conv_net 0.844357713322
       affinity_residualbind 0.834434448241
    no corresponding experiment
188 RNCMPT00047 QKI_K562_200.h5
       affinity_conv_net 0.884661579678
       affinity_residualbind 0.863534182336
    no corresponding experiment
173 RNCMPT00033 IGF2BP2_K562_200.h5
       affinity_conv_net 0.407670932755
       affinity_residualbind 0.429468737601
    no corresponding experiment
6 RNCMPT00106 SRSF1_HepG2_200.h5
       affinity_conv_net 0.875279040944
       affinity_residualbind 0.865529197137
       affinity_all_conv_net 0.878851147152
    Ensemble 0.87563397353
7 RNCMPT00107 SRSF1_HepG2_200.h5
       affinity_conv_net 0.87504183134
       affinity_residualbind 0.882229693851
    no corresponding experiment
8 RNCMPT00108 SRSF1_HepG2_200.h5
       affinity_conv_net 0.860124668785
       affinity_residualbind 0.863691557557
    no corresponding experiment
9 RNCMPT00109 SRSF1_HepG2_200.h5
       affinity_conv_net 0.858372

In [194]:
models = ['affinity_conv_net', 'affinity_residualbind', 'affinity_all_conv_net']
ss_type = 'seq'
normalize_method = 'log_norm' 

# get list of encode-eclip experiments
data_path = '../../data/RNAcompete_2013/rnacompete2013.h5'
experiments = helper.get_experiments_hdf5(data_path)

# get corresponding clip-experiments for rnacompete experiments
correspondence_path = 'correspondences_eCLIP_RNACompete.txt'
match = experiment_correpsondence(correspondence_path, experiments)

# directory for encode-clip experiments
dataset_path = '/media/peter/storage/encode_eclip/eclip_datasets'

results = []
for key in match.keys():
    for experiments in match[key]:
        rbp_index = experiments[0]
        rncmpt_name = experiments[1]
        clip_name = experiments[2]

        # load rbp dataset
        train, valid, test = helper.load_dataset_hdf5(data_path, ss_type=ss_type, rbp_index=rbp_index)

        # process rbp dataset
        train, valid, test = helper.process_data(train, valid, test, method=normalize_method)

        input_shape = list(train['inputs'].shape)
        input_shape[0] = None
        output_shape = train['targets'].shape

        # load clip-seq dataset
        dataset_file_path = os.path.join(dataset_path, clip_name)
        clip_train, clip_valid, clip_test = helper.load_dataset_hdf5(dataset_file_path, ss_type=ss_type)
        X_train = np.vstack([clip_train['inputs'], clip_valid['inputs'], clip_test['inputs']])
        y_train = np.vstack([clip_train['targets'], clip_valid['targets'], clip_test['targets']])
        clip_train = {'inputs': X_train, 'targets': y_train}

        results_path = helper.make_directory('../../results', 'RNAcompete_2013')
        file_path = os.path.join(results_path, normalize_method+'_'+ss_type)
        y_true, ensemble_score, max_score, top_score, mean_score = ensemble_binding_affinity_scores(clip_train, models, input_shape, output_shape, file_path, rncmpt_name, num_ave=20)

        print(rbp_index, rncmpt_name, clip_name)
        for i, model in enumerate(models):
            print('    '+model)
            fpr, tpr, thresholds = roc_curve(y_true[:,0], top_score[:,i])
            roc_score = auc(fpr, tpr)
            print('      max  = ', roc_score)
        for i, model in enumerate(models):
            print('    '+model)
            fpr, tpr, thresholds = roc_curve(y_true[:,0], max_score[:,i])
            roc_score = auc(fpr, tpr)
            print('      T20  = ', roc_score)
        for i, model in enumerate(models):
            print('    '+model)
            fpr, tpr, thresholds = roc_curve(y_true[:,0], mean_score[:,i])
            roc_score = auc(fpr, tpr)
            print('      mean = ', roc_score)
        print('    Ensemble')
        for i in range(3):
            fpr, tpr, thresholds = roc_curve(y_true[:,0], ensemble_score[:,i])
            roc_score = auc(fpr, tpr)
            print('      ', roc_score)
        results.append([rbp_index, rncmpt_name, clip_name, y_true, ensemble_score, mean_score, top_score, max_score])

188 RNCMPT00047 QKI_HepG2_200.h5
    affinity_conv_net
      max  =  0.633039049273
    affinity_residual_net
      max  =  0.663647560548
    affinity_all_conv_net
      max  =  0.588256390838
    affinity_conv_net
      T20  =  0.623336657151
    affinity_residual_net
      T20  =  0.660090030652
    affinity_all_conv_net
      T20  =  0.586273745418
    affinity_conv_net
      mean =  0.706403929703
    affinity_residual_net
      mean =  0.692101520638
    affinity_all_conv_net
      mean =  0.612376100212
    Ensemble
       0.792572201569
       0.805800220962
       0.848357971576
188 RNCMPT00047 QKI_K562_200.h5
    affinity_conv_net
      max  =  0.626455689824
    affinity_residual_net
      max  =  0.652951509471
    affinity_all_conv_net
      max  =  0.583510364504
    affinity_conv_net
      T20  =  0.623369859655
    affinity_residual_net
      T20  =  0.643572756924
    affinity_all_conv_net
      T20  =  0.58305602751
    affinity_conv_net
      mean =  0.733760746019
 

In [78]:
# get list of rnacompete experiments
data_path = '../../data/RNAcompete_2013/rnacompete2013.h5'
experiments = helper.get_experiments_hdf5(data_path)

# get corresponding clip-experiments for rnacompete experiments
correspondence_path = 'correspondences_eCLIP_RNACompete.txt'
match = experiment_correpsondence(correspondence_path, experiments)

# directory for encode-clip experiments
dataset_path = '/media/peter/storage/encode_eclip/eclip_datasets'

models = ['affinity_residualbind', 'affinity_conv_net', 'affinity_all_conv_net']

normalize_method = 'log_norm' 
ss_types = ['seq', 'pu']


# get list of encode-eclip experiments
data_path = '../../data/RNAcompete_2013/rnacompete2013.h5'
experiments = helper.get_experiments_hdf5(data_path)


# get corresponding clip-experiments for rnacompete experiments
correspondence_path = 'correspondences_eCLIP_RNACompete.txt'
match = experiment_correpsondence(correspondence_path, experiments)

results = []
for key in match.keys():
    for experiments in match[key]:
        rbp_index = experiments[0]
        rcmpt_name = experiments[1]
        clip_name = experiments[2]

        model_score = []
        model_max_score = []
        model_mean_score = []
        for ss_type in ss_types:

            # load rbp dataset
            train, valid, test = helper.load_dataset_hdf5(data_path, ss_type=ss_type, rbp_index=rbp_index)
            input_shape = list(train['inputs'].shape)
            input_shape[0] = None
            output_shape = train['targets'].shape

            # load clip-seq dataset
            dataset_path = '/media/peter/storage/encode_eclip/eclip_datasets'
            dataset_file_path = os.path.join(dataset_path, clip_name)
            clip_train, clip_valid, clip_test = helper.load_dataset_hdf5(dataset_file_path, ss_type=ss_type)
            X_train = np.vstack([clip_train['inputs'], clip_valid['inputs'], clip_test['inputs']])
            y_train = np.vstack([clip_train['targets'], clip_valid['targets'], clip_test['targets']])
            clip_train = {'inputs': X_train, 'targets': y_train}

            for model in models:

                results_path = helper.make_directory('../../results', 'RNAcompete_2013')
                file_path = os.path.join(results_path, normalize_method+'_'+ss_type, model, rncmpt_name)
                y_true, max_score, score, mean_score = binding_affinity_scores(clip_train, model, input_shape, output_shape, file_path)

                model_score.append(score)
                model_mean_score.append(mean_score)
                model_max_score.append(max_score)

                
        print(rcmpt_name, clip_name)
    
        from sklearn.metrics import roc_curve, auc, precision_recall_curve, accuracy_score, roc_auc_score
        print('mean')
        for scores in model_mean_score:
            fpr, tpr, thresholds = roc_curve(y_true, scores)
            roc_score = auc(fpr, tpr)
            print('           ', roc_score)
        fpr, tpr, thresholds = roc_curve(y_true, np.mean(np.array(model_mean_score), axis=0))
        roc_score = auc(fpr, tpr)
        print('  ensemble: ', roc_score)

        print('max')
        for scores in model_max_score:
            fpr, tpr, thresholds = roc_curve(y_true, scores)
            roc_score = auc(fpr, tpr)
            print('           ', roc_score)
        fpr, tpr, thresholds = roc_curve(y_true, np.mean(np.array(model_max_score), axis=0))
        roc_score = auc(fpr, tpr)
        print('  ensemble: ', roc_score)

        print('top-20')
        for scores in model_score:
            fpr, tpr, thresholds = roc_curve(y_true, scores)
            roc_score = auc(fpr, tpr)
            print('           ', roc_score)
        fpr, tpr, thresholds = roc_curve(y_true, np.mean(np.array(model_score), axis=0))
        roc_score = auc(fpr, tpr)
        print('  ensemble: ', roc_score)


        #print("%s\t%s\t%s\t%0.4f\t%0.4f\t%0.4f\t%0.4f"%(rcmpt_name, clip_name, model, mean[0], roc_score, roc_score2, roc_score3))
        results.append([rcmpt_name, clip_name, model, y_true, model_score, model_mean_score, model_max_score])


RNCMPT00047 QKI_HepG2_200.h5
mean
            0.605931070316
            0.491899082325
            0.574297777285
            0.566630991611
            0.447441670493
            0.491939602664
  ensemble:  0.528178358777
max
            0.572428945484
            0.50728274213
            0.556939566517
            0.558528242538
            0.487067188019
            0.508627767518
  ensemble:  0.534039845595
top-20
            0.580591954479
            0.504755621667
            0.563888252112
            0.56589421448
            0.482611232123
            0.500983268298
  ensemble:  0.535400349764
RNCMPT00047 QKI_K562_200.h5
mean
            0.65029020337
            0.522555380822
            0.608755966114
            0.621783529766
            0.470099262068
            0.536473867582
  ensemble:  0.567086651073
max
            0.577495767377
            0.514069561723
            0.559569687861
            0.561743165547
            0.493443423581
            0.506761460075