In [6]:
from __future__ import print_function 
import os, sys, h5py
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
from six.moves import cPickle
from sklearn.metrics import roc_curve, auc, precision_recall_curve, accuracy_score, roc_auc_score

sys.path.append('..')
import helper
from deepomics import neuralnetwork as nn
from deepomics import utils, fit

In [7]:



def experiment_correpsondence(correspondence_path, experiments):
    rncmpt_names = []
    clip_names = []
    rbp_names = []
    cell_types = []
    with open(correspondence_path, 'rb') as f:
        for line in f:
            index = line.index('-')
            rncmpt_names.append(line[:index])
            experiment = line[index+1:].split()[0]
            index = experiment.index('_')
            rbp_names.append(experiment[:index])
            cell_types.append(experiment[index+1:])
            clip_names.append(experiment+'_200.h5')
            
    # get a dictionary of tuples for each rbp correspondence
    unique_rbps = np.unique(rbp_names)
    match = {}
    for j, rbp_name in enumerate(unique_rbps):
        match[rbp_name] = []
    for j, rbp_name in enumerate(rbp_names):
        rbp_index = np.where(experiments==rncmpt_names[j])[0][0]
        match[rbp_name].append((rbp_index, rncmpt_names[j], clip_names[j]))
    return match

def binding_affinity_scores(sess, nntrainer, clip_train, num_average=1):

    X_train = clip_train['inputs']
    y_train = clip_train['targets']

    index = np.where(y_train[:,0]==1)[0]
    X_pos = X_train[index]
    y_pos = y_train[index]
    index = np.where(y_train[:,0]==0)[0]
    X_neg = X_train[index]
    y_neg = y_train[index]

    num_split = X_pos.shape[1] - 41

    pos_score = []
    for X in X_pos:
        X_split = []
        for i in range(num_split):
            X_split.append([X[i:i+41,:,:]])
        X_split = np.vstack(X_split)

        affinity = nntrainer.get_activations(sess, {'inputs': X_split})
        affinity = affinity[::-1]
        pos_score.append(np.mean(affinity[:num_average]))
    pos_score = np.array(pos_score)

    neg_score = []
    for X in X_neg:
        X_split = []
        for i in range(num_split):
            X_split.append([X[i:i+41,:,:]])
        X_split = np.vstack(X_split)

        affinity = nntrainer.get_activations(sess, {'inputs': X_split})
        affinity = affinity[::-1]
        neg_score.append(np.mean(affinity[:num_average]))
    neg_score = np.array(neg_score)

    y_true = np.vstack([np.ones((len(pos_score),1)), np.zeros((len(neg_score),1))])
    y_score = np.vstack([np.expand_dims(pos_score,axis=1), np.expand_dims(neg_score, axis=1)])

    return y_true, y_score


In [8]:

# get list of encode-eclip experiments
data_path = '../../data/RNAcompete_2013/rnacompete2013.h5'
experiments = helper.get_experiments_hdf5(data_path)

# get corresponding clip-experiments for rnacompete experiments
correspondence_path = 'correspondences_eCLIP_RNACompete.txt'
match = experiment_correpsondence(correspondence_path, experiments)
    

In [9]:
model = 'affinity_conv_net'
rbp = 'QKI'
rbp_index = 188
rncmpt_name = 'RNCMPT00047'
clip_name = 'RPS3_K562_200.h5'

normalize_method = 'log_norm' 
ss_type = 'seq'

# load rbp dataset
train, valid, test = helper.load_dataset_hdf5(data_path, ss_type=ss_type, rbp_index=rbp_index)

# process rbp dataset
train, valid, test = helper.process_data(train, valid, test, method=normalize_method)

# get shapes
input_shape = list(train['inputs'].shape)
input_shape[0] = None
output_shape = train['targets'].shape

# load model
genome_model = helper.import_model(model)
model_layers, optimization = genome_model.model(input_shape, output_shape)

# build neural network class
nnmodel = nn.NeuralNet(seed=247)
nnmodel.build_layers(model_layers, optimization, use_scope=False)

results_path = helper.make_directory('../../results', 'RNAcompete_2013')
file_path = os.path.join(results_path, normalize_method+'_'+ss_type, model, rncmpt_name)
nntrainer = nn.NeuralTrainer(nnmodel, save='best', file_path=file_path)

# initialize session
sess = utils.initialize_session(nnmodel.placeholders)

# load best model
nntrainer.set_best_parameters(sess)
loss, mean, std = nntrainer.test_model(sess, test)

dataset_path = '/media/peter/storage/encode_eclip/eclip_datasets'
dataset_file_path = os.path.join(dataset_path, clip_name)
clip_train, clip_valid, clip_test = helper.load_dataset_hdf5(dataset_file_path, ss_type=ss_type)

X_train = clip_train['inputs']
y_train = clip_train['targets']

index = np.where(y_train[:,0]==1)[0]
X_pos = X_train[index]
y_pos = y_train[index]
index = np.where(y_train[:,0]==0)[0]
X_neg = X_train[index]
y_neg = y_train[index]

num_split = X_pos.shape[1] - 41

pos_score = []
for X in X_pos:
    X_split = []
    for i in range(num_split):
        X_split.append([X[i:i+41,:,:]])
    X_split = np.vstack(X_split)
    affinity = nntrainer.get_activations(sess, {'inputs': X_split})
    pos_score.append(np.max(affinity))
pos_score = np.array(pos_score)

neg_score = []
for X in X_neg:
    X_split = []
    for i in range(num_split):
        X_split.append([X[i:i+41,:,:]])
    X_split = np.vstack(X_split)

    affinity = nntrainer.get_activations(sess, {'inputs': X_split})
    neg_score.append(np.max(affinity))
neg_score = np.array(neg_score)

y_true = np.vstack([np.ones((len(pos_score),1)), np.zeros((len(neg_score),1))])
y_score = np.vstack([np.expand_dims(pos_score,axis=1), np.expand_dims(neg_score, axis=1)])

from sklearn.metrics import roc_curve, auc, precision_recall_curve, accuracy_score, roc_auc_score
fpr, tpr, thresholds = roc_curve(y_true, y_score)

roc_score = auc(fpr, tpr)
print(roc_score)

InternalError: Failed to create session.

In [None]:
models = ['deep_residual_model', 'conv_net']
normalize_method = 'log_norm' 
ss_type = 'seq'
best_path = '../../results/RNAcompete_2013/'+normalize_method+'_'+ss_type

# get list of rnacompete experiments
data_path = '../../data/RNAcompete_2013/rnacompete2013.h5'
experiments = helper.get_experiments_hdf5(data_path)

# get corresponding clip-experiments for rnacompete experiments
correspondence_path = 'correspondences_eCLIP_RNACompete.txt'
match = experiment_correpsondence(correspondence_path, experiments)

# directory for encode-clip experiments
dataset_path = '/media/peter/storage/encode_eclip/eclip_datasets'


results = []
for key in match.keys():
    for experiments in match[key]:
        rbp_index = experiments[0]
        rcmpt_name = experiments[1]
        clip_name = experiments[2]

        # load rbp dataset
        train, valid, test = helper.load_dataset_hdf5(data_path, ss_type=ss_type, rbp_index=rbp_index)

        # process rbp dataset
        train, valid, test = helper.process_data(train, valid, test, method=normalize_method)
        
        # get shapes
        input_shape = list(train['inputs'].shape)
        input_shape[0] = None
        output_shape = train['targets'].shape

        # load clip dataset
        dataset_file_path = os.path.join(dataset_path, clip_name)
        clip_train, clip_valid, clip_test = helper.load_dataset_hdf5(dataset_file_path, ss_type=ss_type)

        for model in models:
            
            # load model
            genome_model = helper.import_model(model)
            model_layers, optimization = genome_model.model(input_shape, output_shape)

            # build neural network class
            nnmodel = nn.NeuralNet(seed=247)
            nnmodel.build_layers(model_layers, optimization, use_scope=False)

            file_path = os.path.join(best_path, model, rncmpt_name)
            nntrainer = nn.NeuralTrainer(nnmodel, save='best', file_path=file_path)

            # initialize session
            sess = utils.initialize_session(nnmodel.placeholders)

            # load best model
            nntrainer.set_best_parameters(sess)
            
            # loss, mean, std = nntrainer.test_model(sess, test)

            # get scores for each protein
            y_true, y_score = binding_affinity_scores(sess, nntrainer, clip_train, num_average=1)

            fpr, tpr, thresholds = roc_curve(y_true, y_score)
            roc_score = auc(fpr, tpr)

            results.append([rcmpt_name, experiment, model, roc_score, mean[0]])
            
            print("%s\t%s\tR=%0.4f\t%s\t%0.4f"%(rcmpt_name, experiment, mean[0],model, roc_score))
            

RNCMPT00020	FXR2_K562	R=0.4383	deep_residual_model	0.6079
RNCMPT00020	FXR2_K562	R=0.4301	conv_net	0.6111
RNCMPT00047	QKI_HepG2	R=0.0301	deep_residual_model	0.7631


In [23]:
results

[['RNCMPT00020',
  'FXR2_K562',
  'deep_residual_model',
  0.60788912267989526,
  0.43834302],
 ['RNCMPT00020', 'FXR2_K562', 'conv_net', 0.61105647214016312, 0.43006411],
 ['RNCMPT00047',
  'QKI_HepG2',
  'deep_residual_model',
  0.76311873507671757,
  0.030087756],
 ['RNCMPT00047', 'QKI_HepG2', 'conv_net', 0.81096601386601619, 0.032569751],
 ['RNCMPT00047',
  'QKI_K562',
  'deep_residual_model',
  0.82345037098642127,
  0.030087756],
 ['RNCMPT00047', 'QKI_K562', 'conv_net', 0.87599354217227554, 0.032569751],
 ['RNCMPT00022',
  'HNRNPA1_HepG2',
  'deep_residual_model',
  0.81437269668322387,
  0.85029775],
 ['RNCMPT00022', 'HNRNPA1_HepG2', 'conv_net', 0.79125941355551987, 0.82069033],
 ['RNCMPT00022',
  'HNRNPA1_K562',
  'deep_residual_model',
  0.86566108555142396,
  0.85029775],
 ['RNCMPT00022', 'HNRNPA1_K562', 'conv_net', 0.83112399599448195, 0.82069033],
 ['RNCMPT00106',
  'SRSF1_HepG2',
  'deep_residual_model',
  0.83136619255020228,
  0.74718547],
 ['RNCMPT00106', 'SRSF1_HepG2', 