In [None]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import os
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
# Colour cycle and seaborn context shared by every figure in this notebook.
sns.set_palette(['#1E1E1E', '#BB3524', '#F5D54A', '#384827', '#282F44'])
sns.set_context('paper')

# Minimal axes style: white background, no grid, no top/right spines,
# dark-grey ('.15') text and edges, ticks pointing outwards.
sns.set_style({
    'axes.axisbelow': True,
    'axes.edgecolor': '.15',
    'axes.facecolor': 'white',
    'axes.grid': False,
    'axes.labelcolor': '.15',
    'figure.facecolor': 'white',
    'grid.color': '.15',
    'grid.linestyle': ':',
    'grid.alpha': .5,
    'image.cmap': 'Greys',
    'legend.frameon': False,
    'legend.numpoints': 1,
    'legend.scatterpoints': 1,
    'lines.solid_capstyle': 'butt',
    'axes.spines.right': False,
    'axes.spines.top': False,
    'text.color': '.15',
    'xtick.top': False,
    'ytick.right': False,
    'xtick.color': '.15',
    'xtick.direction': 'out',
    'ytick.color': '.15',
    'ytick.direction': 'out',
})


import matplotlib

# Typography: a single font size (in points) for every text element.
FONT_SIZE_PT = 5
matplotlib.rcParams.update({
    'font.family': 'Arial',
    'font.size': FONT_SIZE_PT,
    'axes.labelsize': FONT_SIZE_PT,
    'axes.titlesize': FONT_SIZE_PT,
    'figure.titlesize': FONT_SIZE_PT,
    'xtick.labelsize': FONT_SIZE_PT,
    'ytick.labelsize': FONT_SIZE_PT,
    'legend.fontsize': FONT_SIZE_PT,
    'legend.title_fontsize': FONT_SIZE_PT,
})

# Tick geometry: identical settings for the x and y axes.
matplotlib.rcParams.update({
    'xtick.major.size': 2,
    'ytick.major.size': 2,
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
    'xtick.minor.size': 1,
    'ytick.minor.size': 1,
    'xtick.minor.width': 0.5,
    'ytick.minor.width': 0.5,
})

# Line widths and marker sizes.
matplotlib.rcParams.update({
    'axes.linewidth': 0.5,
    'lines.linewidth': 0.5,
    'grid.linewidth': 0.25,
    'patch.linewidth': 0.25,
    'lines.markeredgewidth': 0.25,
    'lines.markersize': 2,
})

# Figure geometry: sizes are expressed in multiples of 5 mm (converted to inches).
FIVE_MM_IN_INCH = 0.19685
DPI = 600
matplotlib.rcParams.update({
    'figure.figsize': (10 * FIVE_MM_IN_INCH, 9 * FIVE_MM_IN_INCH),
    'savefig.dpi': DPI,
    # Inline previews are rendered at a quarter of the export resolution.
    'figure.dpi': DPI // 4,
})

# Font type 42 for PDF/PS output, see http://phyletica.org/matplotlib-fonts/
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

pd.set_option("display.max_columns", 200)

In [None]:
# Project-local helper module. The explicit reload re-executes it so that edits
# made to helpers.py after the first import are picked up.
import helpers
import importlib
importlib.reload(helpers)

# NOTE(review): the star import hides which names originate from `helpers`.
# The visible cells appear to reference the module only explicitly
# (`helpers.nan_aware_hmean`) -- confirm before removing it.
from helpers import *

In [None]:
# When a `snakemake` object is present in the notebook namespace the inputs,
# outputs and parameters are read from it; otherwise hard-coded debugging
# values are used.
HAVE_SNAKEMAKE = 'snakemake' in locals()

if not HAVE_SNAKEMAKE:
    print("No snakemake -- DEBUG MODE")

    # Scratch directory for outputs produced while debugging.
    _OUTDIR = '.nb-testing-outputs'
    os.makedirs(_OUTDIR, exist_ok=True)

    # Values that are only used to build the debug input file names.
    _bin_size = 1000
    _pseudocount = 100
    _min_periods = 1

    param_cell_line = 'K562'
    param_correlation_method = 'kendall'
    param_input_header_separator = '__'
    param_analysis_mode = 'full'

    _cell_line = param_cell_line

    # One peaklist per dataset type, and one stats file per cell line.
    input_peaklists = [
        f'../../output/final/encode/encode_{dataset}_data.{_cell_line}.bed.tsv.gz'
        for dataset in ('feature_accessibility', 'feature_histone', 'protein')
    ]
    input_stats = [
        f'../../output/final/bedstats/genomic-window-matrix-stats-{_bin_size}bp_params_pc_{_pseudocount}_mp_{_min_periods}_from_bed.{_cell_line}.h5'
    ]

    input_marcs_interaction_data = '../../data/raw/marcs/table-s5/table-s5.sheet.01.edges.full.tsv.gz'
    input_marcs_gene_name_map = '../../output/interim/marcs/genes_to_marcs_from_table-s1.tsv.gz'

    output_plot_dir = os.path.join(_OUTDIR, 'table-s5')

else:
    # Inputs
    input_peaklists = snakemake.input.peaklists
    input_stats = snakemake.input.stats
    input_marcs_interaction_data = snakemake.input.marcs_interaction_data
    input_marcs_gene_name_map = snakemake.input.marcs_gene_name_map

    # Parameters
    param_cell_line = str(snakemake.params.cell_line)
    param_analysis_mode = snakemake.params['analysis_mode']
    param_correlation_method = snakemake.params['correlation_method']

    # Outputs
    output_plot_dir = snakemake.output.plot_dir
    

In [None]:
# Enable IPython's autoreload extension. Mode 1 reloads only the modules that
# were registered with %aimport, before executing each cell.
%load_ext autoreload
%autoreload 1

In [None]:
# Register the project-local `helpers` module for automatic reloading.
%aimport helpers

In [None]:
# Make sure the output directory exists before anything is written to it.
os.makedirs(output_plot_dir, exist_ok=True)

# Input

## MARCS data

First, let's load the interaction data from MARCS

In [None]:
# MARCS interaction table (tab-separated); preview the first rows.
marcs_interaction_data = pd.read_csv(input_marcs_interaction_data, sep='\t')
marcs_interaction_data.head()

We should drop the "Excluded" rows:

In [None]:
# Keep every row whose interaction type is anything other than 'Excluded'.
_is_excluded = marcs_interaction_data['interaction_type'] == 'Excluded'
marcs_interaction_data = marcs_interaction_data[~_is_excluded]
marcs_interaction_data.head()

We should also optimise the index 

In [None]:
# Index by the (row label, column label) pair and sort the index, since the
# pairwise loop further down looks rows up by exactly this pair.
_marcs_pair_columns = ['Gene label (row)', 'Gene label (col)']
marcs_interaction_data = marcs_interaction_data.set_index(_marcs_pair_columns)
marcs_interaction_data = marcs_interaction_data.sort_index()

We will also need a way to convert marcs labels to gene names:

In [None]:
# Tab-separated mapping table; later cells read its `gene_name` and
# `marcs_gene_label` columns to translate factor names into MARCS labels.
marcs_gene_label_map = pd.read_csv(input_marcs_gene_name_map, sep='\t')
marcs_gene_label_map

## NGS data

At this point we need to load the information from the NGS datasets.

Unfortunately, for this particular notebook we cannot use the consolidated datasets, as we need to perform a different analysis.

Because of this, we need to repeat some of the logic from `consolidated_bedstats_for_cell_line.ipynb`.

Load the peaklists

In [None]:
CELL_LINE_COLUMN = 'Biosample term name'

# Read every peaklist (first column becomes the index) and stack them.
_peaklists = [
    pd.read_csv(peaklist_file, sep='\t', index_col=0)
    for peaklist_file in input_peaklists
]
peaklist = pd.concat(_peaklists)

# Identifier of the form "<Factor>-<cell line>-<index value>".
_factor_and_cell = peaklist['Factor'].str.cat(peaklist[CELL_LINE_COLUMN], sep='-')
peaklist['Factor_Cell_Identifier'] = _factor_and_cell.str.cat(peaklist.index, sep='-')

# Flag the rows that describe protein factors.
peaklist['is_protein'] = peaklist['FactorType'].eq('protein')



Load the important stats.

Unlike in other notebooks of this pipeline, which use one-sided normalisation (by rows),
we will be using the harmonic average of the normalised MI coefficients, as we want to know the joint influence of two proteins on each other (instead of the one-directional influence of one factor on another).

In [None]:
# Output name -> key of the matrix inside each HDF store
# (looked up under the `/{param_analysis_mode}/` group).
data_to_load = {
    # Correlation between ChIP-seq signals
    f'{param_correlation_method}_correlation': f'correlation_matrix/{param_correlation_method}',

    # Uncertainty coefficient (unlike in other notebooks, the averaged version is needed here)
    'harmonic_avg_normalised_mi': 'uncertainty_coefficient/avg',
}

_frames_by_name = {name: [] for name in data_to_load}

# Every row label encountered in any of the loaded matrices.
seen_indices = set()

for filename in input_stats:
    with pd.HDFStore(filename, 'r') as store:
        for name, key in data_to_load.items():
            frame = store[f'/{param_analysis_mode}/{key}']
            seen_indices |= set(frame.index)
            _frames_by_name[name].append(frame)

# One concatenated frame per statistic.
data = {name: pd.concat(frames) for name, frames in _frames_by_name.items()}

In [None]:
# Quick look at the loaded normalised-MI matrix.
data['harmonic_avg_normalised_mi']

The previous scripts should have taken care of computing these matrices correctly, so both `harmonic_avg_normalised_mi` and the correlation matrix should be symmetric. We can quickly spot-check this on random pairs:

In [None]:
from numpy.random import RandomState
from numpy.testing import assert_almost_equal

# Spot-check symmetry on 10,000 random pairs of distinct labels: for each pair
# the (a, b) and (b, a) entries must agree to 10 decimals, first in the MI
# matrix and then in the correlation matrix.
_correlation_name = f'{param_correlation_method}_correlation'
_labels = data[_correlation_name].index

rng = RandomState(42)

for _trial in tqdm(range(10000)):
    f_a = f_b = None

    # Redraw until the two labels differ.
    while f_a == f_b:
        f_a = rng.choice(_labels)
        f_b = rng.choice(_labels)

    for _matrix_name in ('harmonic_avg_normalised_mi', _correlation_name):
        _matrix = data[_matrix_name]
        assert_almost_equal(
            _matrix.loc[f_a, f_b],
            _matrix.loc[f_b, f_a],
            decimal=10
        )
    
    

Reindex peaklist with identifiers observed in data to make our life easier of course

In [None]:
# Map every "dataset:<peaklist index>:<aggregation op>" label seen in the stats
# matrices back to its peaklist row.
_records = []

# Iterate in sorted order: `seen_indices` is a set of strings, whose iteration
# order changes between kernel starts (string hashing is randomised), which
# would otherwise make the row order of the resulting frame non-reproducible.
for ix in sorted(seen_indices):
    if not ix.startswith('dataset'):
        continue

    # Decompose the three values separated by ":" to get the index in peaklist.
    # (`__` is avoided as a throw-away name: IPython uses it for output history.)
    _prefix, peaklist_ix, agg_op = ix.split(':')

    if peaklist_ix in peaklist.index:
        _records.append([ix, peaklist_ix, agg_op])

peaklist_correlation_indexed = pd.DataFrame(
    _records,
    columns=['correlation_index', 'peaklist_index', 'agg_op']
)
# Attach the peaklist annotations and index by the label used in the matrices.
peaklist_correlation_indexed = (
    peaklist_correlation_indexed
    .join(peaklist, on='peaklist_index')
    .set_index('correlation_index')
)

# "<Factor_Cell_Identifier>:<aggregation op>"
peaklist_correlation_indexed['Factor_Cell_Identifier_op'] = (
    peaklist_correlation_indexed['Factor_Cell_Identifier']
    .str.cat(peaklist_correlation_indexed['agg_op'], sep=':')
)
peaklist_correlation_indexed.head()

Here we are only interested in pairwise relationships between protein factors we observe in NGS:

In [None]:
# Distinct factor names among the rows flagged as proteins.
_is_protein_row = peaklist_correlation_indexed['is_protein']
protein_factors = set(peaklist_correlation_indexed.loc[_is_protein_row, 'Factor'].unique())

# Number of unordered pairs: n * (n - 1) / 2
len_protein_factors = len(protein_factors)
len_pairwise_combinations = len_protein_factors * (len_protein_factors - 1) // 2

In [None]:
# Report how much work the pairwise loop below has to do.
_summary_template = "Got {:,} protein factors in total, will therefore proccess {:,} pairwise combinations"
print(_summary_template.format(len_protein_factors, len_pairwise_combinations))

We can precompute index lookups for each of the factors

In [None]:
# Display the gene-name map again as a reminder of the columns
# (`gene_name`, `marcs_gene_label`) used for the lookups below.
marcs_gene_label_map

In [None]:
# factor name -> labels of its rows in the stats matrices
peaklist_index_lookup = {
    factor: peaklist_correlation_indexed.loc[
        peaklist_correlation_indexed['Factor'] == factor
    ].index
    for factor in protein_factors
}

# factor name -> sorted MARCS gene labels whose gene name equals the factor name
marcs_index_lookup = {
    factor: sorted(
        marcs_gene_label_map.loc[
            marcs_gene_label_map['gene_name'] == factor, 'marcs_gene_label'
        ].unique()
    )
    for factor in protein_factors
}

At this point we just need to generate a pairwise interaction dataframe for each of the entries in our ChIP-seq dataset.

For each pairwise combination of the Factor values (factor_a, factor_b) in the `data`, we want to know:

1. The mean harmonic average between normalised mi that factor_b explains in factor_a, and factor_a explains in factor_b. The computation of this average has been taken care of in previous scripts. The stats dataframe should be symmetric so we do not need to explicitly worry about order of operations
2. Correlation between peak signals of these data, again from precomputed tables.
3. The MARCS interaction score estimate data for proteins which the factors correspond to

The code below is not optimised so have a coffee, it will take a while

In case of many-to-many mappings between factors and genes, we will be taking mean for numeric variables, and "best" case values for non-numeric data in MARCS based on the ordering below. i.e. if there are multiple classifications, one high-confidence and the other 'q <= 0.001' we will be aggregating that to "high-confidence"

In [None]:
# Preference order (best first) used when several MARCS rows have to be
# collapsed into a single value for a non-numeric column.
marcs_non_numeric_ordering = dict(
    Classification=[
        'high-confidence',
        'q ≤ 0.0001',
        'q ≤ 0.001',
        'q ≤ 0.01',
        'q ≤ 0.05',
        'Other',
    ],
    interaction_type=[
        'In BioGRID',
        'Not in BioGRID',
    ],
)

In [None]:
# Sanity check: every value shown here must be listed in
# marcs_non_numeric_ordering['interaction_type'], because the aggregation loop
# below calls list.index() on it and raises ValueError for an unlisted value.
marcs_interaction_data['interaction_type'].unique()

In [None]:
import itertools

# For every unordered pair of protein factors, combine:
#   * the harmonic mean of the normalised MI over all their NGS dataset pairs,
#   * the NaN-aware mean correlation over the same dataset pairs,
#   * the MARCS interaction rows of the gene labels mapped to both factors.
# Pairs without any MARCS row are collected in `no_data_for` and skipped.

# Looked up once rather than once per dataset pair inside the loop.
_mi_matrix = data['harmonic_avg_normalised_mi']
_corr_matrix = data[f'{param_correlation_method}_correlation']

no_data_for = set()

_ans = []

# `protein_factors` is a set of strings, whose iteration order differs between
# kernel starts. Sorting makes the factor_a/factor_b assignment and the row
# order of the exported table reproducible.
for factor_a, factor_b in tqdm(
        itertools.combinations(sorted(protein_factors), 2),
        total=len_pairwise_combinations
):

    # Collect the MARCS interaction data first: it decides whether the pair
    # is kept at all, so the costlier NGS aggregation can be skipped otherwise.
    marcs_indices_a = marcs_index_lookup[factor_a]
    marcs_indices_b = marcs_index_lookup[factor_b]

    marcs_subdata = []

    # The MARCS table may store the pair in either order, so try both
    # (a, b) and (b, a) before giving up on a label combination.
    for marcs_ix_aa, marcs_ix_bb in itertools.product(marcs_indices_a, marcs_indices_b):

        try:
            # Try a, b
            marcs_row = marcs_interaction_data.loc(axis=0)[marcs_ix_aa, marcs_ix_bb]
        except KeyError:
            try:
                # If we fail, try b, a
                marcs_row = marcs_interaction_data.loc(axis=0)[marcs_ix_bb, marcs_ix_aa]
            except KeyError:
                # If we fail that - give up
                continue

        marcs_subdata.append(marcs_row)

    # If we have no marcs data for this particular combination, record it and continue
    if not marcs_subdata:
        no_data_for.add((factor_a, factor_b))
        continue

    marcs_subdata = pd.DataFrame(marcs_subdata)

    # Gather the mean normalised MI and correlation over all dataset pairs.
    peaklist_indices_a = peaklist_index_lookup[factor_a]
    peaklist_indices_b = peaklist_index_lookup[factor_b]
    ngs_pairs = list(itertools.product(peaklist_indices_a, peaklist_indices_b))

    mean_normed_mi = helpers.nan_aware_hmean([
        _mi_matrix.loc[ix_aa, ix_bb] for ix_aa, ix_bb in ngs_pairs
    ])

    mean_corr = np.nanmean([
        _corr_matrix.loc[ix_aa, ix_bb] for ix_aa, ix_bb in ngs_pairs
    ])

    # Time to aggregate it all (in case there are multiple rows)
    # and save it into the dataframe
    d = {
        'factor_a': factor_a,
        'factor_b': factor_b,
        'hmean_harmonic_avg_normalised_mi': mean_normed_mi,
        f'mean_{param_correlation_method}_corr': mean_corr,
        'n_ngs': len(peaklist_indices_a) * len(peaklist_indices_b),
        'n_marcs': len(marcs_subdata),
    }

    # Numeric MARCS columns are averaged over the matching rows.
    for numeric_col in ['score', 'neg_log10_q']:
        d[f'marcs_{numeric_col}'] = marcs_subdata[numeric_col].mean()

    # Non-numeric MARCS columns take the "best" value, i.e. the one that comes
    # first in marcs_non_numeric_ordering (ValueError for an unlisted value).
    for non_numeric_col in ['Classification', 'interaction_type']:
        d['marcs_{}'.format(non_numeric_col.lower())] = min(
            marcs_subdata[non_numeric_col],
            key=marcs_non_numeric_ordering[non_numeric_col].index
        )

    _ans.append(d)


aggregated_stats = pd.DataFrame(_ans)

In [None]:
# One row per factor pair for which MARCS interaction data was found.
aggregated_stats

We couldn't find interactions in MARCS data for these factors (should only be the Excluded ones where we don't have a gene name assigned uniquely):

In [None]:
# Set of (factor_a, factor_b) tuples that had no matching MARCS row.
no_data_for

Export the data:

In [None]:
# Write the aggregated table as a tab-separated file into the plot directory.
_export_path = os.path.join(output_plot_dir, f'{param_cell_line}-interactions-vs-chip.tsv.gz')
aggregated_stats.to_csv(_export_path, sep='\t')

In [None]:
# Number of factor pairs per (classification, interaction type) combination.
# groupby().size() omits combinations without any rows, which makes the
# `.loc[...]` lookups in the plotting cells raise KeyError. Reindexing against
# the full grid of known categories lists such combinations with a count of 0.
_group_columns = ['marcs_classification', 'marcs_interaction_type']
_all_groups = pd.MultiIndex.from_product(
    [
        marcs_non_numeric_ordering['Classification'],
        marcs_non_numeric_ordering['interaction_type'],
    ],
    names=_group_columns,
)
group_sizes = (
    aggregated_stats
    .groupby(_group_columns)
    .size()
    .reindex(_all_groups, fill_value=0)
)
group_sizes

At this point we have all the data we need to make the plots.

We will use violin plots to illustrate the distribution, and the following statistical test to check for differences between groups:

Note that correction is applied for each subfigure separately

In [None]:
from statannotations.Annotator import Annotator

# Settings shared by every statistical annotation in the figures below:
# the 'Mann-Whitney-gt' test with Bonferroni correction, shown as stars
# outside the axes.
_annotator_kwargs = {
    'test': 'Mann-Whitney-gt',
    'text_format': 'star',
    'loc': 'outside',
    'line_width': 0.5,
    'comparisons_correction': 'bonferroni',
}

We will be testing all MARCS confidence groups against "Other". First, pooling the in-BioGRID and not-in-BioGRID data together (first subfigure, left), then only the not-in-BioGRID data, after splitting the two (second subfigure, right).

Will only test groups that have min number of elements greater than or equal to:

In [None]:
# Minimum number of factor pairs that both groups of a comparison must contain
# for the Mann-Whitney test to be run on them.
min_in_group_for_mwu_test = 8

In [None]:
# Violin plots of the normalised MI per MARCS confidence class.
# Left: all factor pairs pooled. Right: split by BioGRID membership.
fig = plt.figure(figsize=(
    FIVE_MM_IN_INCH*10*2, 
    FIVE_MM_IN_INCH*12 # asymmetric height to allow for some room for annotations
), constrained_layout=True)

ax_together = fig.add_subplot(1, 2, 1)

_kws = dict(
    x='marcs_classification', 
    order=marcs_non_numeric_ordering['Classification'],
    y='hmean_harmonic_avg_normalised_mi',
    data=aggregated_stats
)

sns.violinplot(
    ax=ax_together,
    palette='GnBu_r',
    **_kws
)

print("---------First test (subfigure 1) -------")
# Pooled group size per classification. Groups without any rows may be absent
# from `group_sizes`, so they are looked up with a default of 0 rather than
# with `.loc`, which would raise KeyError.
_pooled_sizes = group_sizes.groupby(level=0).sum()
pairs_together = [
    (c, 'Other')
    for c in marcs_non_numeric_ordering['Classification']
    if c != 'Other'
    and min(_pooled_sizes.get(c, 0), _pooled_sizes.get('Other', 0)) >= min_in_group_for_mwu_test
]
if pairs_together:
    an = Annotator(ax=ax_together, pairs=pairs_together, plot='violinplot', **_kws)
    an.configure(**_annotator_kwargs)
    an.apply_and_annotate()
else:
    # Nothing to compare, so no test is run for this panel.
    print("No group is large enough to be tested against 'Other'")

ax_split = fig.add_subplot(1, 2, 2, sharey=ax_together)

sns.violinplot(
    ax=ax_split,
    hue='marcs_interaction_type',
    hue_order=marcs_non_numeric_ordering['interaction_type'],
    palette=['#398DB6', '#CB5346'],
    **_kws
)

print("---------Second test (subfigure 2) -------")
# Only the 'Not in BioGRID' halves are compared; absent groups count as size 0.
_n_other_not_in_biogrid = group_sizes.get(('Other', 'Not in BioGRID'), 0)
pairs_split = [
    ((c, 'Not in BioGRID'), ('Other', 'Not in BioGRID'))
    for c in marcs_non_numeric_ordering['Classification']
    if c != 'Other'
    and min(group_sizes.get((c, 'Not in BioGRID'), 0), _n_other_not_in_biogrid) >= min_in_group_for_mwu_test
]
if pairs_split:
    an_split = Annotator(
        ax=ax_split, 
        pairs=pairs_split, 
        plot='violinplot',  hue='marcs_interaction_type',
        hue_order=marcs_non_numeric_ordering['interaction_type'], 
        **_kws
    )
    an_split.configure(**_annotator_kwargs)
    an_split.apply_and_annotate()
else:
    print("No 'Not in BioGRID' group is large enough to be tested against 'Other'")


for ax in [ax_together, ax_split]:
    ax.xaxis.set_tick_params(rotation=90)
    ax.set_xlabel("Confidence in interaction (MARCS)")
    ax.set_ylabel(f"Normalised MI in\n{param_cell_line} ChIP-seq datasets")
    
plt.savefig(os.path.join(output_plot_dir, f'{param_cell_line}-interactions-vs-chip-mi.pdf'), bbox_inches='tight')
    

Let's make one more plot for correlation:

In [None]:
# Violin plots of the mean correlation per MARCS confidence class.
# Left: all factor pairs pooled. Right: split by BioGRID membership.
fig = plt.figure(figsize=(
    FIVE_MM_IN_INCH*10*2, 
    FIVE_MM_IN_INCH*12 # asymmetric height to allow for some room for annotations
), constrained_layout=True)

ax_together = fig.add_subplot(1, 2, 1)

_kws = dict(
    x='marcs_classification', 
    order=marcs_non_numeric_ordering['Classification'],
    y=f'mean_{param_correlation_method}_corr',
    data=aggregated_stats
)

sns.violinplot(
    ax=ax_together,
    palette='GnBu_r',
    **_kws
)

print("---------First test (subfigure 1) -------")
# Same rule as for the MI figure: a comparison is only tested when both groups
# contain at least `min_in_group_for_mwu_test` factor pairs. Groups without
# any rows may be absent from `group_sizes` and count as size 0.
_pooled_sizes = group_sizes.groupby(level=0).sum()
pairs_together = [
    (c, 'Other')
    for c in marcs_non_numeric_ordering['Classification']
    if c != 'Other'
    and min(_pooled_sizes.get(c, 0), _pooled_sizes.get('Other', 0)) >= min_in_group_for_mwu_test
]
if pairs_together:
    an = Annotator(ax=ax_together, pairs=pairs_together, plot='violinplot', **_kws)
    an.configure(**_annotator_kwargs)
    an.apply_and_annotate()
else:
    # Nothing to compare, so no test is run for this panel.
    print("No group is large enough to be tested against 'Other'")

ax_split = fig.add_subplot(1, 2, 2, sharey=ax_together)

sns.violinplot(
    ax=ax_split,
    hue='marcs_interaction_type',
    hue_order=marcs_non_numeric_ordering['interaction_type'],
    palette=['#398DB6', '#CB5346'],
    **_kws
)

print("---------Second test (subfigure 2) -------")
# Only the 'Not in BioGRID' halves are compared; absent groups count as size 0.
_n_other_not_in_biogrid = group_sizes.get(('Other', 'Not in BioGRID'), 0)
pairs_split = [
    ((c, 'Not in BioGRID'), ('Other', 'Not in BioGRID'))
    for c in marcs_non_numeric_ordering['Classification']
    if c != 'Other'
    and min(group_sizes.get((c, 'Not in BioGRID'), 0), _n_other_not_in_biogrid) >= min_in_group_for_mwu_test
]
if pairs_split:
    an_split = Annotator(
        ax=ax_split, 
        pairs=pairs_split, 
        plot='violinplot',  hue='marcs_interaction_type',
        hue_order=marcs_non_numeric_ordering['interaction_type'], 
        **_kws
    )
    an_split.configure(**_annotator_kwargs)
    an_split.apply_and_annotate()
else:
    print("No 'Not in BioGRID' group is large enough to be tested against 'Other'")


for ax in [ax_together, ax_split]:
    ax.xaxis.set_tick_params(rotation=90)
    ax.set_xlabel("Confidence in interaction (MARCS)")
    ax.set_ylabel(f"Correlation ({param_correlation_method}) in\n{param_cell_line} ChIP-seq datasets")
    
plt.savefig(os.path.join(output_plot_dir, f'{param_cell_line}-interactions-vs-chip-corr.pdf'), bbox_inches='tight')
    

And that should be it !