In [None]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
# %matplotlib widget

import os
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
# Colour cycle and scaling context shared by every figure in this notebook.
sns.set_palette(['#1E1E1E', '#BB3524', '#F5D54A', '#384827', '#282F44'])
sns.set_context('paper')

# Style overrides handed to seaborn in one go: white background, no grid,
# left/bottom spines only, dark-grey ('.15') text, ticks and edges.
_SEABORN_STYLE = {
    # axes
    'axes.axisbelow': True,
    'axes.edgecolor': '.15',
    'axes.facecolor': 'white',
    'axes.grid': False,
    'axes.labelcolor': '.15',
    'axes.spines.right': False,
    'axes.spines.top': False,
    # figure / image / text
    'figure.facecolor': 'white',
    'image.cmap': 'Greys',
    'text.color': '.15',
    # grid
    'grid.color': '.15',
    'grid.linestyle': ':',
    'grid.alpha': .5,
    # legend
    'legend.frameon': False,
    'legend.numpoints': 1,
    'legend.scatterpoints': 1,
    # lines
    'lines.solid_capstyle': 'butt',
    # ticks
    'xtick.top': False,
    'ytick.right': False,
    'xtick.color': '.15',
    'xtick.direction': 'out',
    'ytick.color': '.15',
    'ytick.direction': 'out',
}
sns.set_style(_SEABORN_STYLE)


import matplotlib

# --- Fonts: a single point size for every text element -----------------------
FONT_SIZE_PT = 5
matplotlib.rcParams['font.family'] = 'Arial'
matplotlib.rcParams.update({
    size_key: FONT_SIZE_PT
    for size_key in (
        'font.size',
        'axes.labelsize',
        'axes.titlesize',
        'figure.titlesize',
        'xtick.labelsize',
        'ytick.labelsize',
        'legend.fontsize',
        'legend.title_fontsize',
    )
})

# --- Ticks: same geometry on both axes ---------------------------------------
for _tick_axis in ('xtick', 'ytick'):
    matplotlib.rcParams.update({
        f'{_tick_axis}.major.size': 2,
        f'{_tick_axis}.major.width': 0.5,
        f'{_tick_axis}.minor.size': 1,
        f'{_tick_axis}.minor.width': 0.5,
    })

# --- Line widths and markers -------------------------------------------------
matplotlib.rcParams.update({
    'axes.linewidth': 0.5,
    'lines.linewidth': 0.5,
    'grid.linewidth': 0.25,
    'patch.linewidth': 0.25,
    'lines.markeredgewidth': 0.25,
    'lines.markersize': 2,
})

# --- Figure size and resolution ----------------------------------------------
# The default figure size is expressed in multiples of 5 mm (converted to inches).
FIVE_MM_IN_INCH = 0.19685
DPI = 600
matplotlib.rcParams.update({
    'figure.figsize': (10 * FIVE_MM_IN_INCH, 9 * FIVE_MM_IN_INCH),
    'savefig.dpi': DPI,
    # On-screen rendering uses a quarter of the export resolution.
    'figure.dpi': DPI // 4,
})

# --- Font embedding for PDF/PS export ----------------------------------------
#http://phyletica.org/matplotlib-fonts/
matplotlib.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

# Show up to 200 columns when displaying DataFrames.
pd.set_option("display.max_columns", 200)

In [None]:
import palettable

In [None]:
# Resolve the notebook's inputs, outputs and parameters.
#
# When a `snakemake` object is present in the namespace, everything is read
# from it. Otherwise the notebook runs in a stand-alone debug mode with
# hard-coded defaults and writes its outputs to a scratch directory.
HAVE_SNAKEMAKE = 'snakemake' in locals()

if HAVE_SNAKEMAKE:
    input_peaklists = snakemake.input.peaklists
    input_stats = snakemake.input.stats
    
    input_marcs_features = snakemake.input.marcs_features
    input_marcs_gene_label_map = snakemake.input.marcs_gene_label_map
    
    param_op = snakemake.params.agg_op_of_interest
    
    output_csv = snakemake.output.csv
    output_xlsx = snakemake.output.xlsx

    param_analysis_mode = snakemake.params['analysis_mode']
    param_correlation_method = snakemake.params['correlation_method']
    param_cell_line = snakemake.params['cell_line']
    param_bin_size = snakemake.params['bin_size']
    
    param_marcs_gene_label_separator = snakemake.params['marcs_gene_label_separator']
    # Separator used when flattening two-level column headers; defaults to '__'.
    param_output_header_sep = snakemake.params.get('output_header_sep', '__')
    
else:
    print("No snakemake -- DEBUG MODE")
    
    _OUTDIR = '.nb-testing-outputs'
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(_OUTDIR, exist_ok=True)
   
    param_bin_size = 1000
    input_peaklists = []
    input_stats = []
    
    param_cell_line = 'K562'
    
    # Single source of truth for the bin size: the stats path below is built
    # from `param_bin_size`, so `_bin_size` is kept only as an alias of it.
    _bin_size = param_bin_size
    _pseudocount = 100
    _min_periods = 1
    
    for _cell_line in [param_cell_line]:
        for dataset in ['feature_accessibility', 'feature_histone', 'protein']:
            input_peaklists.append(f'../../output/final/encode/encode_{dataset}_data.{_cell_line}.bed.tsv.gz')
        
        input_stats.append(f'../../output/final/bedstats/genomic-window-matrix-stats-{param_bin_size}bp_params_pc_{_pseudocount}_mp_{_min_periods}_from_bed.{_cell_line}.h5')
    
    input_marcs_features = '../../output/interim/marcs/table-s3.long.tsv.gz'
    input_marcs_gene_label_map = '../../output/interim/marcs/genes_to_marcs_from_table-s1.tsv.gz'

    output_xlsx = os.path.join(_OUTDIR, f'bedstats_consolidated_{param_cell_line}.xlsx')
    output_csv = os.path.join(_OUTDIR, f'bedstats_consolidated_{param_cell_line}.csv')
    
    param_op = 'max'
    
    param_analysis_mode = 'full'
    param_correlation_method = 'kendall'
    
    param_marcs_gene_label_separator = '||'
    
    param_output_header_sep = '__'
    

The goal of this notebook is to consolidate the various ChIP-seq statistics into something that can be readily used to make summary plots.


# Input

Load the required data: the peaklists from tab-separated files and the statistics from HDF5 stores.

## Peaklist

First, the peaklist

In [None]:
# Name of the peaklist column that holds the cell line.
CELL_LINE_COLUMN = 'Biosample term name'

# Read every peaklist file (tab-separated, first column as index) and stack them.
_peaklists = []
for peaklist_file in input_peaklists:
    _peaklists.append(pd.read_csv(peaklist_file, sep='\t', index_col=0))
peaklist = pd.concat(_peaklists)

# Identifier of the form '<Factor>-<cell line>-<peaklist index>'.
_factor_and_cell = peaklist['Factor'].str.cat(peaklist[CELL_LINE_COLUMN], sep='-')
peaklist['Factor_Cell_Identifier'] = _factor_and_cell.str.cat(peaklist.index, sep='-')

# Flag the rows whose factor type is 'protein'.
peaklist['is_protein'] = peaklist['FactorType'].eq('protein')



In [None]:
# Number of peaklist rows per value of the 'FactorType' column.
peaklist['FactorType'].value_counts()

In [None]:
# Preview the first rows of the concatenated peaklist.
peaklist.head()

## Stats

Load the most important information that we will need to make the report

In [None]:
# Maps the name used in this notebook to the key of the corresponding table
# inside the HDF5 store (relative to the '/<analysis mode>/' group).
data_to_load = {
    # Correlation between ChIP-seq signals
    f'{param_correlation_method}_correlation': f'correlation_matrix/{param_correlation_method}',

    # Mutual information
    'mi': 'mutual_information',

    # Marginal entropy
    'entropy': 'entropy/marginal',

    # Uncertainty coefficient (by rows)
    'normalised_mi': 'uncertainty_coefficient/by_rows',

    # Joint counts (all four possible combinations)
    'counts_true_true': 'counts/joint/a:True_b:True',
    'counts_true_false': 'counts/joint/a:True_b:False',
    'counts_false_true': 'counts/joint/a:False_b:True',
    'counts_false_false': 'counts/joint/a:False_b:False',
}

# Collect, per name, one table from every stats file, and remember every row
# label encountered along the way.
_tables_by_name = {name: [] for name in data_to_load}
seen_indices = set()

for stats_path in input_stats:
    with pd.HDFStore(stats_path, 'r') as store:
        for name, store_key in data_to_load.items():
            table = store[f'/{param_analysis_mode}/{store_key}']
            seen_indices.update(table.index)
            _tables_by_name[name].append(table)

# Stack the per-file tables into a single object per name.
data = {name: pd.concat(tables) for name, tables in _tables_by_name.items()}

Reindex the peaklist with identifiers in data to make our life easier

In [None]:
# Build a lookup table from the stats row labels to the peaklist rows.
_label_records = []

for stats_label in seen_indices:
    if stats_label.startswith('dataset'):
        # Labels consist of three ':'-separated fields; the middle one is the
        # index into `peaklist`, the last one the aggregation operation.
        _prefix, peaklist_ix, agg_op = stats_label.split(':')

        if peaklist_ix in peaklist.index:
            _label_records.append([stats_label, peaklist_ix, agg_op])

_label_table = pd.DataFrame(
    _label_records,
    columns=['correlation_index', 'peaklist_index', 'agg_op'],
)

# Attach the peaklist metadata and index the result by the stats label.
peaklist_correlation_indexed = (
    _label_table
    .join(peaklist, on='peaklist_index')
    .set_index('correlation_index')
)

# '<Factor_Cell_Identifier>:<agg_op>'
peaklist_correlation_indexed['Factor_Cell_Identifier_op'] = (
    peaklist_correlation_indexed['Factor_Cell_Identifier']
    .str.cat(peaklist_correlation_indexed['agg_op'], sep=':')
)
peaklist_correlation_indexed.head()

And now reindex everything using `Factor_Cell_Identifier` to make it even easier

In [None]:
# Sanity check: 'Factor_Cell_Identifier' must be unique, because the cells below
# use it as the row/column label of every statistics table.
assert not peaklist_correlation_indexed['Factor_Cell_Identifier'].duplicated().any()

In [None]:
def _to_factor_cell_identifier(label):
    """Return the 'Factor_Cell_Identifier' for `label`, or `label` itself if the lookup raises KeyError."""
    try:
        return peaklist_correlation_indexed.loc[label, 'Factor_Cell_Identifier']
    except KeyError:
        return label


# Relabel the rows (and, for DataFrames, the columns) of every statistics table.
for stat_name in data:
    stat_table = data[stat_name]

    stat_table.index = [_to_factor_cell_identifier(label) for label in stat_table.index]

    # Series (e.g. the marginal entropy) have no columns to relabel.
    if isinstance(stat_table, pd.DataFrame):
        stat_table.columns = [_to_factor_cell_identifier(label) for label in stat_table.columns]
        
    

In [None]:
# Switch the index of the lookup table to 'Factor_Cell_Identifier'.
# NOTE: this rebinds the same name and moves the column into the index, so
# re-running this cell on its own raises KeyError.
peaklist_correlation_indexed = peaklist_correlation_indexed.set_index('Factor_Cell_Identifier')

## MARCS Features

Figure out which MARCS gene labels we will need to map ChIP-seq to features

In [None]:
import re

# The separator is escaped so that it is matched literally when splitting.
_separator_pattern = re.escape(param_marcs_gene_label_separator)

# One column per label after the split; rows without a MARCS label are dropped first.
_split_labels = (
    peaklist_correlation_indexed['marcs_gene_label']
    .dropna()
    .str.split(_separator_pattern, expand=True)
)

# Long format: one row per (ChIP-seq entry, MARCS gene label) pair.
peaklist_to_marcs_map = _split_labels.stack().reset_index()
peaklist_to_marcs_map.columns = ['correlation_index', 'no', 'marcs_gene_label']
peaklist_to_marcs_map = peaklist_to_marcs_map.drop(columns='no').drop_duplicates()
peaklist_to_marcs_map

In [None]:
# How many entries map to 1, 2, ... distinct MARCS gene labels.
peaklist_to_marcs_map.groupby('correlation_index')['marcs_gene_label'].nunique().value_counts()

Now let's reindex the MARCS feature dataframe by `correlation_index` column, not `Gene label`.
In cases where one factor maps to multiple estimates of a feature effect, use the row with the lowest P-value.

In [None]:
# Long-format MARCS feature table, indexed by (gene label, feature).
marcs_features_long = pd.read_csv(input_marcs_features, sep='\t')
marcs_features_long = marcs_features_long.set_index(['Gene label', 'Feature'])
marcs_features_long.head()

In [None]:
import helpers

In [None]:
# Re-index the MARCS feature estimates by ChIP-seq entry ('correlation_index')
# instead of by MARCS gene label, one table per feature, then combine them
# into a single wide table with (column, feature) headers.
marcs_feature_list = helpers.MARCS_FEATURE_ORDER
# The feature order must cover exactly the features present in the data.
assert set(marcs_feature_list) == set(marcs_features_long.reset_index()['Feature'].unique())

_index_df = peaklist_to_marcs_map
_column_to_reindex_to = 'correlation_index'

marcs_features_reindexed = {}

for feature in marcs_feature_list:
    feature_df = marcs_features_long.xs(feature, level='Feature')

    # Attach the feature estimates to every (entry, MARCS gene label) pair and
    # keep only the pairs that have a P value. The index is reset so that the
    # row labels used for the selection below are guaranteed to be unique.
    _joined = (
        _index_df
        .join(feature_df, on='marcs_gene_label')
        .dropna(subset=['P value'])
        .reset_index(drop=True)
    )

    # Solve multimappings where one gene name maps to multiple MARCS gene names,
    # by taking the entry with lowest p-value (first one on ties).
    _best_rows = _joined.groupby(_column_to_reindex_to)['P value'].idxmin()
    _df = _joined.loc[_best_rows].set_index(_column_to_reindex_to)
    assert not _df.index.duplicated().any()

    marcs_features_reindexed[feature] = _df

marcs_features_reindexed_wide = pd.concat(
    list(marcs_features_reindexed.values()),
    keys=list(marcs_features_reindexed.keys()),
    axis=1,
)
# Put the column name on the outer level so that e.g. ['Effect'] selects all features.
marcs_features_reindexed_wide = marcs_features_reindexed_wide.swaplevel(axis=1)
marcs_features_reindexed_wide = marcs_features_reindexed_wide.sort_index(axis=1)


In [None]:
# Preview the wide MARCS feature table.
marcs_features_reindexed_wide.head()

# Output

## Prep
At this point we have most of the data ready; let's combine it into a format that we can save as CSV.

We want protein (and other) ChIP-seqs as rows, and non-protein ChIP-seqs as columns

In [None]:
# Index labels of the entries flagged as proteins.
_is_protein_mask = peaklist_correlation_indexed['is_protein']
protein_indices = peaklist_correlation_indexed.loc[_is_protein_mask].index

In [None]:
def _add_header_level(frame, header):
    """Replace the columns of `frame` (in place) with a ('header', 'column') MultiIndex and return it."""
    frame.columns = pd.MultiIndex.from_tuples(
        zip(np.repeat(header, len(frame.columns)), frame.columns),
        names=['header', 'column']
    )
    return frame


data_adjusted = {}
for stat_name, stat in data.items():
    if stat_name != 'entropy':
        # Make the table asymmetric: keep only the non-protein entries as columns.
        non_protein_indices = stat.columns.difference(protein_indices)
        labelled = _add_header_level(stat.loc[:, non_protein_indices].copy(), stat_name)
        data_adjusted[stat_name] = labelled.sort_index(axis=1)
        continue

    # Entropy is a Series and therefore handled separately.
    # 1) Entropy of the row entry: a single-column frame.
    row_entropy = pd.DataFrame(stat.copy())
    row_entropy.columns = pd.MultiIndex.from_tuples(
        [('entropy_by_row', 'entropy_by_row')],
        names=['header', 'column']
    )
    data_adjusted['entropy_by_row'] = row_entropy

    # 2) Entropy of the column entry: only needed for the non-protein entries.
    #    Their entropies are repeated on every row.
    non_protein_indices = stat.index.difference(protein_indices)
    column_entropy_values = stat.copy().loc[non_protein_indices]
    column_entropy = pd.DataFrame(
        np.broadcast_to(column_entropy_values, (len(stat), len(non_protein_indices))),
        index=stat.index, columns=non_protein_indices
    )
    column_entropy = _add_header_level(column_entropy, 'entropy_by_col')
    data_adjusted['entropy_by_col'] = column_entropy.sort_index(axis=1)


# Add the MARCS data as well: the effect estimates and the strong significance category.
_MARCS_SOURCE_COLUMNS = {
    'marcs_feature_effect': 'Effect',
    'marcs_feature_significant_category': 'significant_category_strong',
}
for marcs_key, source_column in _MARCS_SOURCE_COLUMNS.items():
    data_adjusted[marcs_key] = _add_header_level(
        marcs_features_reindexed_wide[source_column].copy(),
        marcs_key,
    )

In [None]:
# Metadata columns that lead the output table, taken from the peaklist lookup.
data_protein_peaklist = peaklist_correlation_indexed.loc[
    :,
    ['File accession', CELL_LINE_COLUMN, 'Factor', 'FactorType', 'marcs_gene_label']
].copy()

# Give the metadata the same two-level ('header', 'column') layout as the
# statistics tables. `names=` is the documented keyword of
# `MultiIndex.from_tuples` and matches the other cells in this notebook.
data_protein_peaklist.columns = pd.MultiIndex.from_tuples(
    [
        ('metadata', 'encode_id'),
        ('metadata', 'cell_line'),
        ('metadata', 'factor'),
        ('metadata', 'factor_type'),
        ('metadata', 'marcs_gene_label'),
    ],
    names=['header', 'column']
)
data_protein_peaklist

Assemble data for output

In [None]:
# Inspect the normalised mutual information table.
data_adjusted['normalised_mi']

In [None]:
# Inspect the correlation table for the configured correlation method.
data_adjusted[f'{param_correlation_method}_correlation']

In [None]:
# Tables joined onto the metadata, in the order in which their column groups
# appear in the output.
_OUTPUT_TABLE_ORDER = [
    'marcs_feature_significant_category',
    'normalised_mi',
    'entropy_by_row',
    f'{param_correlation_method}_correlation',
    'mi',
    'entropy_by_col',
    'counts_true_true',
    'counts_false_true',
    'counts_true_false',
    'counts_false_false',
]

data_for_output = data_protein_peaklist.copy()
for _table_name in _OUTPUT_TABLE_ORDER:
    data_for_output = data_for_output.join(data_adjusted[_table_name])
data_for_output = data_for_output.sort_index()
data_for_output

In [None]:
# Flatten the two-level header into '<header><sep><column>' strings.
_flat_columns = [
    param_output_header_sep.join(header_levels)
    for header_levels in data_for_output.columns
]

data_for_output_squashed_header = data_for_output.copy()
data_for_output_squashed_header.columns = _flat_columns

# Write the flattened table to CSV.
data_for_output_squashed_header.to_csv(output_csv)

In [None]:
# Preview the table with flattened headers.
data_for_output_squashed_header.head()

# Excel output

In [None]:
import xlsxwriter
from seaborn.utils import relative_luminance

### Summary sheet (main)

In [None]:
# Write the consolidated table to a formatted Excel sheet:
#   row 0  - merged group titles,
#   row 1  - rotated column names,
#   row 2+ - data, with conditional formatting per column group.
df_excel = data_for_output_squashed_header.copy()


def _columns_with_header(header):
    """Return the `df_excel` columns whose part before the header separator equals `header`."""
    return [c for c in df_excel if c.partition(param_output_header_sep)[0] == header]


# Shorten MARCS categories so they don't take so much space in excel
df_excel = df_excel.replace(
    {
        c: {'Neither': 'N', 'Strongly recruited': 'R', 'Strongly excluded': 'E'}
        for c in _columns_with_header('marcs_feature_significant_category')
    }
)

# Group titles that are referred to more than once below.
GROUP_NORMALISED_MI = 'Normalised MI'
GROUP_CORRELATION = '{} correlation'.format(param_correlation_method.capitalize())
GROUP_MARCS = 'MARCS Feature Response (R: strongly recruited, E: strongly excluded, N: neither)'
GROUP_COUNTS_TRUE_TRUE = f'Number of {param_bin_size}bp bins where row and col peaks co-occur'
GROUP_COUNTS_TRUE_FALSE = f'Number of {param_bin_size}bp bins where row peaks occur without col peaks'
GROUP_COUNTS_FALSE_TRUE = f'Number of {param_bin_size}bp bins where col peaks occur without row peaks'
GROUP_COUNTS_FALSE_FALSE = f'Number of {param_bin_size}bp bins where neither row, nor col peaks are present'

# Group title -> columns belonging to the group.
COLUMN_GROUPS = {
    'Metadata': _columns_with_header('metadata'),
    GROUP_NORMALISED_MI: _columns_with_header('normalised_mi'),
    GROUP_CORRELATION: _columns_with_header(f'{param_correlation_method}_correlation'),
    GROUP_MARCS: _columns_with_header('marcs_feature_significant_category'),
    'MI': _columns_with_header('mi'),
    'Entropy (row)': _columns_with_header('entropy_by_row'),
    'Entropy (col)': _columns_with_header('entropy_by_col'),
    GROUP_COUNTS_TRUE_TRUE: _columns_with_header('counts_true_true'),
    GROUP_COUNTS_TRUE_FALSE: _columns_with_header('counts_true_false'),
    GROUP_COUNTS_FALSE_TRUE: _columns_with_header('counts_false_true'),
    GROUP_COUNTS_FALSE_FALSE: _columns_with_header('counts_false_false'),
}

# Column name shown in the sheet: the part after the header separator.
RENAMES = {c: c.partition(param_output_header_sep)[2] for c in df_excel.columns}

# The last column of every (non-empty) group gets a right border as a visual separator.
SEPARATOR_COLUMNS = {cols[-1] for cols in COLUMN_GROUPS.values() if cols}

color_red = '#d6604d'
color_white = '#f7f7f7'
color_blue = '#4393c3'

# The context manager saves and closes the workbook, also when an error occurs.
# (`ExcelWriter.save()` is no longer available in pandas 2.x.)
with pd.ExcelWriter(output_xlsx, engine='xlsxwriter') as writer:
    workbook = writer.book

    bold = workbook.add_format({'bold': True})
    bold_right = workbook.add_format({'bold': True, 'right': 1})

    bold_rotated = workbook.add_format({'bold': True, 'rotation':90})
    bold_rotated_right = workbook.add_format({'bold': True, 'rotation':90, 'right': 1})

    merged_format = workbook.add_format({
        'bold': 1,
        'align': 'center',
        'valign': 'vcenter',
        'right': 1,
    })

    right_border = workbook.add_format({
        'right': 1,
    })

    sheet_name = f"summary_{param_cell_line}"

    # Rows 0 and 1 are reserved for the two header rows written manually below.
    first_data_row = 2
    first_data_col = 0
    df_excel.to_excel(
        writer,
        sheet_name=sheet_name,
        startrow=first_data_row,
        startcol=first_data_col,
        index=False,
        header=False
    )

    # Zero-based, inclusive coordinates of the last data cell.
    last_data_row = max(first_data_row, first_data_row + len(df_excel) - 1)
    last_data_col = max(first_data_col, first_data_col + len(df_excel.columns) - 1)

    worksheet = writer.sheets[sheet_name]

    # Header row with the (rotated) column names.
    colname_to_index_map = {}
    for i, col in enumerate(df_excel.columns, start=first_data_col):
        fmt_ = bold_rotated_right if col in SEPARATOR_COLUMNS else bold_rotated

        worksheet.write(first_data_row-1, i, RENAMES.get(col, col), fmt_)
        colname_to_index_map[col] = i

    # Top row with one merged title per column group.
    for merged_name, col_list in COLUMN_GROUPS.items():
        if not col_list:
            # Nothing to label for a group without columns.
            continue

        _first = colname_to_index_map[col_list[0]]
        _last = colname_to_index_map[col_list[-1]]

        if _first == _last:
            # Cannot merge one column only
            worksheet.write(first_data_row-2, _first, merged_name, merged_format)
        else:
            worksheet.merge_range(
                first_data_row-2, _first,
                first_data_row-2, _last,
                merged_name,
                merged_format
            )

    for col in SEPARATOR_COLUMNS:
        worksheet.set_column(colname_to_index_map[col], colname_to_index_map[col], cell_format=right_border)

    # Narrow columns for the colour-coded groups. Each set_column call replaces
    # the column's previous settings, so the separator border has to be passed
    # again here or it would be lost.
    for narrow_group in [GROUP_NORMALISED_MI, GROUP_CORRELATION, GROUP_MARCS]:
        for col in COLUMN_GROUPS[narrow_group]:
            worksheet.set_column(
                colname_to_index_map[col], colname_to_index_map[col],
                4,
                right_border if col in SEPARATOR_COLUMNS else None
            )

    # MARCS response: red background for 'R', blue for 'E'; the font colour is
    # chosen from the luminance of the background.
    for (val, color) in [('R', color_red), ('E', color_blue)]:
        fmt_ = workbook.add_format({
            'bg_color': color,
            'font_color': "#000000" if relative_luminance(color) > .408 else "#FFFFFF"
        })

        for col in COLUMN_GROUPS[GROUP_MARCS]:
            worksheet.conditional_format(
                first_data_row, colname_to_index_map[col],
                last_data_row, colname_to_index_map[col],
                {
                    'type': 'cell',
                    'criteria': 'equal to',
                    'value': f'"{val}"',
                    'format': fmt_,
                }
            )

    # Normalised MI: two-colour scale from 0 to 0.5.
    for col in COLUMN_GROUPS[GROUP_NORMALISED_MI]:
        worksheet.conditional_format(
            first_data_row, colname_to_index_map[col],
            last_data_row, colname_to_index_map[col],
            {
                'type': '2_color_scale',
                'min_type': 'num',
                'max_type': 'num',
                'min_value': 0,
                'max_value': 0.5,
                'max_color': '#084081',
                'min_color': '#f7fcf0',

            }
        )

    # Correlation: diverging blue-white-red scale from -0.5 to 0.5.
    for col in COLUMN_GROUPS[GROUP_CORRELATION]:
        worksheet.conditional_format(
            first_data_row, colname_to_index_map[col],
            last_data_row, colname_to_index_map[col],
            {
                'type': '3_color_scale',
                'min_type': 'num',
                'max_type': 'num',
                'mid_type': 'num',
                'min_value': -0.5,
                'max_value': 0.5,
                'mid_value': 0,
                'max_color': color_red,
                'mid_color': color_white,
                'min_color': color_blue,

            }
        )

    # Data bars (1st to 99th percentile) for entropy, MI and the joint counts.
    data_bar_cols = []
    for col_grp in [
        'Entropy (row)',
        'MI',
        GROUP_COUNTS_TRUE_TRUE,
        GROUP_COUNTS_TRUE_FALSE,
        GROUP_COUNTS_FALSE_TRUE,
        GROUP_COUNTS_FALSE_FALSE,
    ]:
        data_bar_cols.extend(COLUMN_GROUPS[col_grp])

    for col in data_bar_cols:
        worksheet.conditional_format(
            first_data_row, colname_to_index_map[col],
            last_data_row, colname_to_index_map[col],
            {
                'type': 'data_bar',
                'min_type': 'percentile',
                'max_type': 'percentile',
                'min_value': 1,
                'max_value': 99,

            }
        )

    # Keep both header rows and everything left of the first 'Normalised MI'
    # column visible while scrolling.
    if COLUMN_GROUPS[GROUP_NORMALISED_MI]:
        worksheet.freeze_panes(first_data_row, colname_to_index_map[COLUMN_GROUPS[GROUP_NORMALISED_MI][0]])
    worksheet.autofilter(first_data_row-1, first_data_col, last_data_row, last_data_col)

print("Done!")

In [None]:
# Display the table that was written to the Excel sheet.
df_excel