## Generate comparison plots of sequence logos for different attribution methods


Figures generated from this notebook include:
- Fig. 2d
- Extended Data Fig. 2c and 2d


In [1]:
import os
import numpy as np
from six.moves import cPickle
import matplotlib.pyplot as plt
import logomaker
import pandas as pd
from tfomics import utils
import helper

Using TensorFlow backend.


In [2]:
# Experiment settings: number of trials, model names and activation names.
num_trials = 10
model_names = [
    'cnn-local',
    'cnn-dist',
]
activations = [
    'relu',
    'exponential',
    'sigmoid',
    'tanh',
    'softplus',
    'linear',
]

# Everything for this task lives under ../results/task3.
results_path = os.path.join('../results', 'task3')
save_path = os.path.join(results_path, 'scores')
params_path = os.path.join(results_path, 'model_params')

# make_directory receives the parent folder and a sub-folder name; its return
# value is the location the logo figures are saved to.
plot_path = utils.make_directory(results_path, 'attr_logo_plots')

In [5]:
# Read the synthetic dataset and unpack its six arrays.
data_path = '../data/synthetic_code_dataset.h5'
data = helper.load_data(data_path)
(x_train, y_train,
 x_valid, y_valid,
 x_test, y_test) = data

# Ground-truth model for the test split.
test_model = helper.load_synthetic_models(data_path, dataset='test')

# Keep the test rows whose first label column equals 1, capped at the first 500.
# NOTE(review): indexing with the truncated index assumes x_test and test_model
# are NumPy arrays -- confirm against helper.load_data.
true_index = np.where(y_test[:, 0] == 1)[0]
subset_index = true_index[:500]
X = x_test[subset_index]
X_model = test_model[subset_index]

In [10]:
# Only the saliency maps are kept in this section; the three other score
# objects are still read from each file but not stored in all_scores.
score_names = ['saliency_scores']
activations = ['relu', 'exponential', 'sigmoid', 'tanh', 'softplus', 'linear']

# all_scores maps '<model>_<activation>' to an array with one entry per
# score name.
all_scores = {}
for model_name in model_names:
    for activation in activations:
        name = '_'.join([model_name, activation])
        print(name)

        # Each file holds four pickled objects back to back; they are read
        # sequentially in this order.
        file_path = os.path.join(save_path, name + '.pickle')
        with open(file_path, 'rb') as f:
            saliency_scores = cPickle.load(f)
            mut_scores = cPickle.load(f)
            integrated_scores = cPickle.load(f)
            shap_scores = cPickle.load(f)

        # Element [0] of the loaded object is used (presumably the first
        # trial -- TODO confirm against the notebook that wrote the file).
        model_scores = [saliency_scores[0]]

        scores = []
        for score, score_name in zip(model_scores, score_names):
            if 'mut' not in score_name:
                # mask the attribution map with the input sequence
                masked = score * X
            else:
                # mutagenesis scores: L2 norm over the last axis, then mask
                masked = np.sqrt(np.sum(score**2, axis=-1, keepdims=True)) * X
            scores.append(masked)
        all_scores[name] = np.array(scores)

    

cnn-local_relu
cnn-local_exponential
cnn-local_sigmoid
cnn-local_tanh
cnn-local_softplus
cnn-local_linear
cnn-dist_relu
cnn-dist_exponential
cnn-dist_sigmoid
cnn-dist_tanh
cnn-dist_softplus
cnn-dist_linear


In [12]:
# Attribution results are generated by task3_plot_attr_score_comparisons.ipynb
# and read back here as a single pickled object.
file_path = os.path.join(results_path, 'task3_attr_results.pickle')
with open(file_path, 'rb') as handle:
    # NOTE: unpickling can execute arbitrary code; only load files you created.
    results = cPickle.load(handle)

# compare attribution maps for different activation functions

### Compare cnn-dist attribution scores for different activations

In [16]:
# Order the candidate indices by the stored saliency result of the cnn-dist
# exponential model, largest value first.
# NOTE(review): `results` is produced by another notebook; this assumes element
# [0] holds one value per row of X, because the sorted positions are used below
# as row indices into X, X_model and all_scores -- confirm.
sort_index = np.argsort(results['cnn-dist_exponential']['saliency_scores'][0])[::-1]

# Keys into all_scores (after the 'cnn-dist_' prefix) in panel order, paired
# positionally with the panel labels. A cell-local list is used instead of the
# global `activations` so the panels stay consistent even if that global is
# changed by another cell.
logo_activations = ['relu', 'exponential', 'sigmoid', 'tanh', 'softplus', 'linear']
names = ['Relu', 'Exp', 'Sigmoid', 'Tanh', 'Softplus', 'Linear']
num_plots = 50

# Unpack the dimensions BEFORE the loop: they are needed to build x and x_model
# on the first iteration (assigning them later raises NameError on a fresh
# kernel).
N, L, A = X.shape
num_rows = len(names) + 1   # one panel per activation plus the ground truth

for index in sort_index[:num_plots]:
    print(index)

    # Replicate the sequence and its ground-truth model once per panel so that
    # every activation is evaluated against the same reference in one call.
    x = np.ones((len(names), L, A)) * X[index]
    x_model = np.ones((len(names), A, L)) * X_model[index]

    # Entry 0 along the first axis of each all_scores array is the saliency map.
    scores = np.array([all_scores['cnn-dist_' + act][0, index, :, :]
                       for act in logo_activations])

    roc_score, pr_score = helper.interpretability_performance(x, scores, x_model)

    fig = plt.figure(figsize=(25, 10))
    for k, label in enumerate(names):
        # rows = sequence positions, columns = nucleotides
        counts_df = pd.DataFrame(np.asarray(scores[k], dtype=float),
                                 columns=list('ACGT'), index=list(range(L)))

        ax = plt.subplot(num_rows, 1, k + 1)
        logomaker.Logo(counts_df, ax=ax)
        ax.yaxis.set_ticks_position('none')
        ax.xaxis.set_ticks_position('none')
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_ylabel(label, fontsize=16)

        # The twin axis exists only to print this panel's pr_score on the
        # right-hand side of the logo.
        ax2 = ax.twinx()
        ax2.set_ylabel(np.round(pr_score[k], 4), fontsize=16)
        ax2.set_yticks([])

    # Ground-truth panel: weight each position by
    # I = log2(4) + sum_a w * log2(w); the 1e-7 offset avoids log2(0).
    # NOTE(review): assumes each row of w is a distribution over the four
    # nucleotides -- confirm against helper.load_synthetic_models.
    w = X_model[index].T
    I = np.log2(4) + np.sum(w * np.log2(w + 1e-7), axis=1, keepdims=True)
    logo = I * w
    counts_df = pd.DataFrame(np.asarray(logo, dtype=float),
                             columns=list('ACGT'), index=list(range(L)))

    ax = plt.subplot(num_rows, 1, num_rows)
    logomaker.Logo(counts_df, ax=ax)
    ax.set_ylabel('Truth', fontsize=16)
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    ax.set_xticks([])
    ax.set_yticks([])

    # NOTE(review): the file name says 'cnn-deep' although the plotted model is
    # cnn-dist; kept unchanged so existing output paths stay valid. savefig
    # uses the name verbatim, so the PDF is written without a '.pdf' extension.
    outfile = os.path.join(plot_path, 'task3_cnn-deep_logo_activations_'+str(index))
    fig.savefig(outfile, format='pdf', dpi=200, bbox_inches='tight')
    plt.close(fig)
    


1
3
2
8
0
4
9
6
5
7


# compare attribution methods

In [21]:
# This section compares all four attribution methods, but only for the relu
# and exponential activations.
score_names = ['saliency_scores', 'mut_scores', 'integrated_scores', 'shap_scores']
activations = ['relu', 'exponential']

# all_scores maps '<model>_<activation>' to an array whose first axis follows
# the order of score_names.
all_scores = {}
for model_name in model_names:
    for activation in activations:
        name = '_'.join([model_name, activation])
        print(name)

        # Each file holds four pickled objects back to back; they are read
        # sequentially in this order.
        file_path = os.path.join(save_path, name + '.pickle')
        with open(file_path, 'rb') as f:
            saliency_scores = cPickle.load(f)
            mut_scores = cPickle.load(f)
            integrated_scores = cPickle.load(f)
            shap_scores = cPickle.load(f)

        # Element [0] of every loaded object is used (presumably the first
        # trial -- TODO confirm against the notebook that wrote the file).
        model_scores = [
            saliency_scores[0],
            mut_scores[0],
            integrated_scores[0],
            shap_scores[0],
        ]

        # Mutagenesis scores are reduced to an L2 norm over the last axis
        # before masking with the input; all other scores are masked directly.
        scores = [
            np.sqrt(np.sum(score**2, axis=-1, keepdims=True)) * X
            if 'mut' in score_name
            else score * X
            for score, score_name in zip(model_scores, score_names)
        ]
        all_scores[name] = np.array(scores)

cnn-local_relu
cnn-local_exponential
cnn-dist_relu
cnn-dist_exponential


In [22]:
# Rank the sequences by the first value returned by
# helper.interpretability_performance for the saliency maps (method index 0)
# of the cnn-dist exponential model, largest first.
roc_score, pr_score = helper.interpretability_performance(X, all_scores['cnn-dist_exponential'][0], X_model)
sort_index = np.argsort(roc_score)[::-1]

# Panel labels, in the same order as the first axis of each all_scores array.
names = ['\nSaliency', '\nMutagenesis', 'Integrated\nGradients', '\nDeepSHAP']
num_methods = len(names)
num_rows = num_methods + 1   # one panel per method plus the ground truth

# (key in all_scores, output file prefix) for every model that gets its own
# figure per sequence. Both figures share the exact same layout.
model_outputs = [
    ('cnn-dist_exponential', 'task3_cnn-dist_exp_methods_logo_'),
    ('cnn-dist_relu', 'task3_cnn-dist_relu_methods_logo_'),
]

N, L, A = X.shape
for index in sort_index[:50]:
    print(index)

    # Replicate the sequence and its ground-truth model once per method so all
    # methods are evaluated against the same reference in one call.
    x = np.ones((num_methods, L, A)) * X[index]
    x_model = np.ones((num_methods, A, L)) * X_model[index]

    # Ground-truth logo: weight each position by
    # I = log2(4) + sum_a w * log2(w); the 1e-7 offset avoids log2(0).
    # It does not depend on the model, so it is computed once per sequence.
    # NOTE(review): assumes each row of w is a distribution over the four
    # nucleotides -- confirm against helper.load_synthetic_models.
    w = X_model[index].T
    I = np.log2(4) + np.sum(w * np.log2(w + 1e-7), axis=1, keepdims=True)
    logo = I * w
    truth_df = pd.DataFrame(np.asarray(logo, dtype=float),
                            columns=list('ACGT'), index=list(range(L)))

    for model_key, file_prefix in model_outputs:
        # Stack the maps of all methods for this sequence -> (num_methods, L, A)
        scores = np.array([all_scores[model_key][m, index, :, :]
                           for m in range(num_methods)])

        # Separate names so the ranking array above is not overwritten.
        model_roc, model_pr = helper.interpretability_performance(x, scores, x_model)

        fig = plt.figure(figsize=(25, 10))
        for k, label in enumerate(names):
            # rows = sequence positions, columns = nucleotides
            counts_df = pd.DataFrame(np.asarray(scores[k], dtype=float),
                                     columns=list('ACGT'), index=list(range(L)))

            ax = plt.subplot(num_rows, 1, k + 1)
            logomaker.Logo(counts_df, ax=ax)
            ax.yaxis.set_ticks_position('none')
            ax.xaxis.set_ticks_position('none')
            ax.set_xticks([])
            ax.set_yticks([])
            ax.set_ylabel(label, fontsize=16)

            # The twin axis exists only to print this panel's score on the
            # right-hand side of the logo.
            ax2 = ax.twinx()
            ax2.set_ylabel(np.round(model_pr[k], 4), fontsize=16)
            ax2.set_yticks([])

        ax = plt.subplot(num_rows, 1, num_rows)
        logomaker.Logo(truth_df, ax=ax)
        ax.set_ylabel('Truth', fontsize=16)
        ax.yaxis.set_ticks_position('none')
        ax.xaxis.set_ticks_position('none')
        ax.set_xticks([])
        ax.set_yticks([])

        # savefig uses the name verbatim, so the PDF is written without a
        # '.pdf' extension.
        outfile = os.path.join(plot_path, file_prefix + str(index))
        fig.savefig(outfile, format='pdf', dpi=200, bbox_inches='tight')
        plt.close(fig)
    
    


6
3
15
4
7
10
13
5
14
8
1
12
17
11
16
9
18
2
0
19
