In [1]:
import numpy as np

from tensorflow.keras.models import load_model

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
# Global plot styling for every figure produced below: scale seaborn's fonts
# by 2, then switch to the "white" seaborn style. The order matters because
# sns.set() also (re)applies a style, so set_style() must come after it.
sns.set(font_scale = 2)
sns.set_style("white")
# 'none' makes matplotlib write text in saved SVG files as text elements
# rather than converting glyphs to paths.
rcParams['svg.fonttype'] = 'none'

from scipy.stats import pearsonr,spearmanr

import pandas as pd

In [2]:
# Keras regression models keyed by cell type. Every model is loaded from its
# .hdf5 file with compile=False; the cells shown here only call summary(),
# get_config() and predict() on them.
regression_models = {
    cell_type: load_model(model_path, compile=False)
    for cell_type, model_path in (
        ("LCL",              "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_lcl/model_11_hg19_tf.hdf5"),
        ("Microglia",        "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_gosselin_microglia/model_7_tf.hdf5"),
        ("Neuron",           "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_fullard_neun/model_2_tf.hdf5"),
        ("Monocyte",         "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_monocyte/model_8_tf.hdf5"),
        ("mono_scatac",      "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/scatac_snigdha_models_tf_versions/mono_subset_model_tf.hdf5"),
        ("dendritic_scatac", "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/scatac_snigdha_models_tf_versions/dendritic_subset_model_tf.hdf5"),
        ("nkCells_scatac",   "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/scatac_snigdha_models_tf_versions/nkCells_subset_model_tf.hdf5"),
        ("basophil_scatac",  "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/scatac_snigdha_models_tf_versions/basophil_subset_model_tf.hdf5"),
        ("progen_scatac",    "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/scatac_snigdha_models_tf_versions/progen_subset_model_tf.hdf5"),
        ("cd4_scatac",       "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/scatac_snigdha_models_tf_versions/cd4_subset_model_tf.hdf5"),
        ("cd8_scatac",       "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/scatac_snigdha_models_tf_versions/cd8_subset_model_tf.hdf5"),
        ("B_cells_scatac",   "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/scatac_snigdha_models_tf_versions/B_cells_subset_model_tf.hdf5"),
    )
}

In [16]:
# Print the layer-by-layer summary (layer type, output shape, parameter count)
# of the "mono_scatac" model as a quick architecture check.
regression_models["mono_scatac"].summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 493, 500)          16500     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 123, 500)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 123, 500)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 116, 250)          1000250   
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 29, 250)           0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 29, 250)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 22, 100)          

In [14]:
# Print the layer-by-layer summary of the "mono_scatac" model.
# NOTE(review): this cell repeats the summary() call made in the previous
# cell and produces the same output; one of the two can probably be removed.
regression_models["mono_scatac"].summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 493, 500)          16500     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 123, 500)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 123, 500)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 116, 250)          1000250   
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 29, 250)           0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 29, 250)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 22, 100)          

In [5]:
# Dump the configuration dict of each layer of the "mono_scatac" model,
# one dict per printed line, in layer order.
mono_scatac_model = regression_models["mono_scatac"]
for layer in mono_scatac_model.layers:
    print(layer.get_config())

{'name': 'conv1d_1', 'trainable': True, 'batch_input_shape': (None, 500, 4), 'dtype': 'float32', 'filters': 500, 'kernel_size': (8,), 'strides': (1,), 'padding': 'valid', 'data_format': 'channels_last', 'dilation_rate': (1,), 'groups': 1, 'activation': 'relu', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': {'class_name': 'L1L2', 'config': {'l1': 0.0, 'l2': 0.0010000000474974513}}, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}
{'name': 'max_pooling1d_1', 'trainable': True, 'dtype': 'float32', 'strides': (4,), 'pool_size': (4,), 'padding': 'valid', 'data_format': 'channels_last'}
{'name': 'dropout_1', 'trainable': True, 'dtype': 'float32', 'rate': 0.25, 'noise_shape': None, 'seed': None}
{'name': 'conv1d_2', 'trainable': True, 'dtype':

In [3]:
# Validation-set model inputs, one .npy array per cell type (same keys as
# regression_models).
validation_sets_regression = {
    cell_type: np.load(npy_path)
    for cell_type, npy_path in (
        ("LCL",              "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_lcl/chr4_summit_centered_validation_set_hg19_X.npy"),
        ("Microglia",        "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_gosselin_microglia/chr4_summit_centered_validation_set_X.npy"),
        ("Neuron",           "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_fullard_neun/chr4_summit_centered_validation_set_X.npy"),
        ("Monocyte",         "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_monocyte/chr4_summit_centered_validation_set_X.npy"),
        ("mono_scatac",      "/home/snigdhaa/diff_acess/Archr/mono/subsetPeaks_validationInput.npy"),
        ("dendritic_scatac", "/home/snigdhaa/diff_acess/Archr/dendritic/subsetPeaks_validationInput.npy"),
        ("nkCells_scatac",   "/home/snigdhaa/diff_acess/Archr/nkCells/subsetPeaks_validationInput.npy"),
        ("basophil_scatac",  "/home/snigdhaa/diff_acess/Archr/basophil/subsetPeaks_validationInput.npy"),
        ("progen_scatac",    "/home/snigdhaa/diff_acess/Archr/progen/subsetPeaks_validationInput.npy"),
        ("cd4_scatac",       "/home/snigdhaa/diff_acess/Archr/cd4/subsetPeaks_validationInput.npy"),
        ("cd8_scatac",       "/home/snigdhaa/diff_acess/Archr/cd8/subsetPeaks_validationInput.npy"),
        ("B_cells_scatac",   "/home/snigdhaa/diff_acess/Archr/B_cells/subsetPeaks_validationInput.npy"),
    )
}

In [4]:
# Validation-set regression targets, one .npy array per cell type (same keys
# as the validation inputs).
validation_sets_regression_labels = {
    cell_type: np.load(npy_path)
    for cell_type, npy_path in (
        ("LCL",              "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_lcl/chr4_summit_centered_validation_set_hg19_Y.npy"),
        ("Microglia",        "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_gosselin_microglia/chr4_summit_centered_validation_set_Y.npy"),
        ("Neuron",           "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_fullard_neun/chr4_summit_centered_validation_set_Y.npy"),
        ("Monocyte",         "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_monocyte/chr4_summit_centered_validation_set_Y.npy"),
        ("mono_scatac",      "/home/snigdhaa/diff_acess/Archr/mono/subsetPeaks_validationLabels.npy"),
        ("dendritic_scatac", "/home/snigdhaa/diff_acess/Archr/dendritic/subsetPeaks_validationLabels.npy"),
        ("nkCells_scatac",   "/home/snigdhaa/diff_acess/Archr/nkCells/subsetPeaks_validationLabels.npy"),
        ("basophil_scatac",  "/home/snigdhaa/diff_acess/Archr/basophil/subsetPeaks_validationLabels.npy"),
        ("progen_scatac",    "/home/snigdhaa/diff_acess/Archr/progen/subsetPeaks_validationLabels.npy"),
        ("cd4_scatac",       "/home/snigdhaa/diff_acess/Archr/cd4/subsetPeaks_validationLabels.npy"),
        ("cd8_scatac",       "/home/snigdhaa/diff_acess/Archr/cd8/subsetPeaks_validationLabels.npy"),
        ("B_cells_scatac",   "/home/snigdhaa/diff_acess/Archr/B_cells/subsetPeaks_validationLabels.npy"),
    )
}

In [5]:
# Test-set model inputs, one .npy array per cell type. The key order of this
# dict drives the prediction and plotting loops further down.
test_sets_regression = {
    cell_type: np.load(npy_path)
    for cell_type, npy_path in (
        ("LCL",              "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_lcl/chr8_9_summit_centered_test_set_hg19_X.npy"),
        ("Microglia",        "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_gosselin_microglia/chr8_9_summit_centered_test_set_X.npy"),
        ("Neuron",           "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_fullard_neun/chr8_9_summit_centered_test_set_X.npy"),
        ("Monocyte",         "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_monocyte/chr8_9_summit_centered_test_set_X.npy"),
        ("mono_scatac",      "/home/snigdhaa/diff_acess/Archr/mono/subsetPeaks_testInput.npy"),
        ("dendritic_scatac", "/home/snigdhaa/diff_acess/Archr/dendritic/subsetPeaks_testInput.npy"),
        ("nkCells_scatac",   "/home/snigdhaa/diff_acess/Archr/nkCells/subsetPeaks_testInput.npy"),
        ("basophil_scatac",  "/home/snigdhaa/diff_acess/Archr/basophil/subsetPeaks_testInput.npy"),
        ("progen_scatac",    "/home/snigdhaa/diff_acess/Archr/progen/subsetPeaks_testInput.npy"),
        ("cd4_scatac",       "/home/snigdhaa/diff_acess/Archr/cd4/subsetPeaks_testInput.npy"),
        ("cd8_scatac",       "/home/snigdhaa/diff_acess/Archr/cd8/subsetPeaks_testInput.npy"),
        ("B_cells_scatac",   "/home/snigdhaa/diff_acess/Archr/B_cells/subsetPeaks_testInput.npy"),
    )
}

In [6]:
# Test-set regression targets, one .npy array per cell type (same keys as the
# test inputs).
test_sets_regression_labels = {
    cell_type: np.load(npy_path)
    for cell_type, npy_path in (
        ("LCL",              "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_lcl/chr8_9_summit_centered_test_set_hg19_Y.npy"),
        ("Microglia",        "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_gosselin_microglia/chr8_9_summit_centered_test_set_Y.npy"),
        ("Neuron",           "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_fullard_neun/chr8_9_summit_centered_test_set_Y.npy"),
        ("Monocyte",         "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_monocyte/chr8_9_summit_centered_test_set_Y.npy"),
        ("mono_scatac",      "/home/snigdhaa/diff_acess/Archr/mono/subsetPeaks_testLabels.npy"),
        ("dendritic_scatac", "/home/snigdhaa/diff_acess/Archr/dendritic/subsetPeaks_testLabels.npy"),
        ("nkCells_scatac",   "/home/snigdhaa/diff_acess/Archr/nkCells/subsetPeaks_testLabels.npy"),
        ("basophil_scatac",  "/home/snigdhaa/diff_acess/Archr/basophil/subsetPeaks_testLabels.npy"),
        ("progen_scatac",    "/home/snigdhaa/diff_acess/Archr/progen/subsetPeaks_testLabels.npy"),
        ("cd4_scatac",       "/home/snigdhaa/diff_acess/Archr/cd4/subsetPeaks_testLabels.npy"),
        ("cd8_scatac",       "/home/snigdhaa/diff_acess/Archr/cd8/subsetPeaks_testLabels.npy"),
        ("B_cells_scatac",   "/home/snigdhaa/diff_acess/Archr/B_cells/subsetPeaks_testLabels.npy"),
    )
}

In [7]:
# Run each cell type's model on its own validation and test inputs and keep
# the raw predict() outputs, keyed by cell type.
validation_sets_predictions, test_sets_predictions = {}, {}

for cell_type, test_inputs in test_sets_regression.items():
    model = regression_models[cell_type]
    validation_sets_predictions[cell_type] = model.predict(validation_sets_regression[cell_type])
    test_sets_predictions[cell_type] = model.predict(test_inputs)

In [8]:
# The four cell types without a "_scatac" suffix. Membership in this set is
# what decides, in the plotting loop, whether replicate-correlation plots are
# drawn for a cell type.
bulk_ocr_cell_type_names = set(("LCL", "Microglia", "Neuron", "Monocyte"))

In [9]:
# Per-cell-type path to the gzip-compressed IDR thresholded-peak table that
# plotReplicateCorrelations reads.
replicateCorrelationInFiles = dict([
    ("LCL", "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/lcl_dnase_encode/atac/5840747c-53d2-4b55-8091-72684bd9fc7f/call-idr/shard-0/execution/rep1_rep2.idr0.05.thresholded-peaks.bfilt.txt.gz"),
    ("Monocyte", "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_human_monocyte_brain/furtherProcessedPeaks/humanMonocyteDHS/peak/macs2/idr/pooled_pseudo_reps/humanMonocyteDHS_ppr.IDR0.1.thresholded-peaks.bfilt.txt.gz"),
    ("Microglia", "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/gosselin_microglia_atac/exvivo_idr_pipeline_out/atac/0c4b0010-a158-4176-a424-059a4fb12cef/call-idr_ppr/execution/ppr.idr0.05.thresholded-peaks.bfilt.txt.gz"),
    ("Neuron", "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ml_fullard_neun/ppr.idr0.1.thresholded-peaks.bfilt.txt.gz"),
])

# True when a cell type's file compares rep1 vs rep2 (plots are labelled
# "rep"); False when the plots should be labelled "ppr". Only LCL is True.
repNotPpr = dict.fromkeys(replicateCorrelationInFiles, False)
repNotPpr["LCL"] = True

In [30]:
def plotRegressionModelPerformance(true_signal, predicted_signal, cell_type, label):
    """Save a hexbin + KDE plot of predicted vs. measured signal for one split.

    The joint axes are annotated with the Pearson and Spearman correlations
    (and their p-values) between the two flattened inputs. The figure is
    written to ``<cell_type>_regression_<label>_performance.svg`` in the
    current working directory and then closed.

    Parameters
    ----------
    true_signal : array-like
        Measured signal values; drawn on the x axis ("Signal value").
    predicted_signal : array-like
        Model predictions for the same examples; drawn on the y axis
        ("Predicted signal"). Must flatten to the same length as
        ``true_signal``.
    cell_type : str
        Used in the plot title and in the output file name.
    label : str
        Name of the data split (the notebook passes "validation" or "test");
        used in the plot title and in the output file name.

    Returns
    -------
    None
    """
    # Flatten once and reuse for both the plot and the correlation tests.
    true_flat = np.asarray(true_signal).ravel()
    predicted_flat = np.asarray(predicted_signal).ravel()

    p = sns.jointplot(x=true_flat,
                      y=predicted_flat,
                      kind='hex',
                      color='#1f77b4',
                     ).plot_joint(sns.kdeplot,
                                  color='#1f77b4',
                                  linewidths=0.5
                                 )

    # Only the joint panel is wanted; drop the marginal histograms.
    p.ax_marg_x.remove()
    p.ax_marg_y.remove()

    print(np.shape(true_signal))
    r, rp = pearsonr(true_flat, predicted_flat)
    rho, rhop = spearmanr(true_flat, predicted_flat)

    p.set_axis_labels("Signal value", "Predicted signal")

    p.ax_joint.annotate("r={:0.2}, p={:0.2}".format(r,rp)+ "\n" + r"$\rho$={:0.2}, p={:0.2}".format(rho, rhop),
                        xy=(0.5,0.1),
                        xycoords="axes fraction",
                        ha="left",
                        va="center",
                        bbox={'fc': 'none', 'ec': 'none'},
                        color='red',
                        fontsize=15,
                        weight='bold'
                       )

    # Address the grid's own axes/figure explicitly instead of relying on
    # pyplot's "current" axes and figure, which other plotting code can change.
    fig = p.ax_joint.figure
    p.ax_joint.set_title(" ".join([cell_type, label, "set"]))
    fig.savefig("_".join([cell_type,"regression",label,"performance"])+".svg")
    plt.close(fig)

In [31]:
def plotReplicateCorrelations(inFiles, repNotPpr, cell_type, label):
    """Save a hexbin + KDE plot of REP1 vs. REP2 signal for one data split.

    Reads the peak table for ``cell_type``, keeps the rows whose chromosome
    belongs to the requested split, and plots ``REP1_SIGNAL`` against
    ``REP2_SIGNAL`` annotated with their Pearson and Spearman correlations.
    The figure is written to ``<cell_type>_<rep|ppr>_correlations_<label>.svg``
    in the current working directory and then closed. The number of rows kept
    is printed.

    Parameters
    ----------
    inFiles : mapping of str -> str
        Cell type to path of a tab-separated, gzip-compressed table that is
        read with the 20 column names listed below (no header row).
    repNotPpr : mapping of str -> bool
        Cell type to flag; a truthy value labels the axes/title/file name with
        "rep", otherwise "ppr".
    cell_type : str
        Key into ``inFiles`` and ``repNotPpr``; also used in labels and in the
        output file name.
    label : str
        "validation" (chr4) or "test" (chr8 and chr9).

    Raises
    ------
    KeyError
        If ``label`` is not "validation"/"test", or ``cell_type`` is missing
        from ``inFiles`` or ``repNotPpr``.

    Returns
    -------
    None
    """
    labelToChrMapping = {"validation": ["chr4"],
                         "test": ["chr8", "chr9"],
                        }
    # Resolve the split before touching the file so a bad label fails fast.
    split_chromosomes = labelToChrMapping[label]

    column_names = ["CHR",
                    "START",
                    "END",
                    "NAME",
                    "SCORE",
                    "STRAND",
                    "SIGNAL",
                    "P",
                    "Q",
                    "SUMMIT",
                    "LOCALIDR",
                    "GLOBALIDR",
                    "REP1_START",
                    "REP1_END",
                    "REP1_SIGNAL",
                    "REP1_SUMMIT",
                    "REP2_START",
                    "REP2_END",
                    "REP2_SIGNAL",
                    "REP2_SUMMIT"]
    replicate_data = pd.read_csv(inFiles[cell_type],
                                 sep="\t", compression="gzip",
                                 names=column_names
                                )

    subsetted_replicate_data = replicate_data.loc[replicate_data["CHR"].isin(split_chromosomes)]
    print(subsetted_replicate_data.shape[0])

    addStr = "rep" if repNotPpr[cell_type] else "ppr"

    # Pull the two signal columns once; they feed both the plot and the stats.
    rep1_signal = subsetted_replicate_data["REP1_SIGNAL"]
    rep2_signal = subsetted_replicate_data["REP2_SIGNAL"]

    p = sns.jointplot(x=rep1_signal,
                      y=rep2_signal,
                      kind='hex',
                      color='#1f77b4',
                     ).plot_joint(sns.kdeplot,
                                  color='#1f77b4',
                                  linewidths=0.5
                                 )

    # Only the joint panel is wanted; drop the marginal histograms.
    p.ax_marg_x.remove()
    p.ax_marg_y.remove()

    r, rp = pearsonr(rep1_signal, rep2_signal)
    rho, rhop = spearmanr(rep1_signal, rep2_signal)

    p.ax_joint.annotate("r={:0.2}, p={:0.2}".format(r,rp)+ "\n" + r"$\rho$={:0.2}, p={:0.2}".format(rho, rhop),
                        xy=(0.5,0.1),
                        xycoords="axes fraction",
                        ha="left",
                        va="center",
                        bbox={'fc': 'none', 'ec': 'none'},
                        color='red',
                        fontsize=15,
                        weight='bold'
                       )
    p.set_axis_labels("Signal ("+cell_type+" "+addStr+"1)", "Signal ("+cell_type+" "+addStr+"2)")

    # Address the grid's own axes/figure explicitly instead of relying on
    # pyplot's "current" axes and figure, which other plotting code can change.
    fig = p.ax_joint.figure
    p.ax_joint.set_title(" ".join([cell_type, addStr, "correlations", label, "set"]))
    fig.savefig("_".join([cell_type,addStr+"_correlations",label])+".svg")
    plt.close(fig)

In [32]:
# For every cell type, save the model-performance plot for the validation and
# test splits; for the cell types in bulk_ocr_cell_type_names, also save the
# replicate-correlation plot for the same split.
# Each entry: (split name, measured labels by cell type, predictions by cell type).
evaluation_splits = (
    ("validation", validation_sets_regression_labels, validation_sets_predictions),
    ("test", test_sets_regression_labels, test_sets_predictions),
)

for cell_type in test_sets_regression:
    for split_name, labels_by_cell_type, predictions_by_cell_type in evaluation_splits:
        # plotRegressionModelPerformance takes (true_signal, predicted_signal):
        # the measured labels must come first so that the "Signal value" x axis
        # shows measurements and the "Predicted signal" y axis shows predictions.
        plotRegressionModelPerformance(labels_by_cell_type[cell_type],
                                       predictions_by_cell_type[cell_type],
                                       cell_type,
                                       split_name)
        if cell_type in bulk_ocr_cell_type_names:
            plotReplicateCorrelations(replicateCorrelationInFiles, repNotPpr, cell_type, split_name)


(3894, 1)
3894
(9196, 1)
9196
(8888, 1)
8888
(15982, 1)
15982
(13557, 1)
13557
(17003, 1)
17003
(4684, 1)
4684
(10513, 1)
10513
(29615, 1)
(13700, 1)
(33780, 1)
(15362, 1)
(17673, 1)
(7822, 1)
(12601, 1)
(5422, 1)
(36164, 1)
(16481, 1)
(24603, 1)
(10709, 1)
(22980, 1)
(9906, 1)
(20894, 1)
(9643, 1)
