In [None]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import os
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
# Style overrides handed to seaborn in one go: white background, faint dotted
# grid drawn below the data, outward ticks, and no top/right spines.
_SEABORN_STYLE = {
    # axes
    'axes.axisbelow': True,
    'axes.edgecolor': '.15',
    'axes.facecolor': 'white',
    'axes.grid': True,
    'axes.labelcolor': '.15',
    'axes.spines.right': False,
    'axes.spines.top': False,
    # figure / image / text
    'figure.facecolor': 'white',
    'image.cmap': 'Greys',
    'text.color': '.15',
    # grid
    'grid.color': '.15',
    'grid.linestyle': ':',
    'grid.alpha': .5,
    # legend
    'legend.frameon': False,
    'legend.numpoints': 1,
    'legend.scatterpoints': 1,
    # lines
    'lines.solid_capstyle': 'butt',
    # ticks
    'xtick.top': False,
    'xtick.color': '.15',
    'xtick.direction': 'out',
    'ytick.right': False,
    'ytick.color': '.15',
    'ytick.direction': 'out',
}

sns.set_palette('Dark2')
sns.set_context('paper')
sns.set_style(_SEABORN_STYLE)


import matplotlib

# Constants reused by later cells (figure sizes and savefig calls).
FONT_SIZE_PT = 5
FIVE_MM_IN_INCH = 0.19685
DPI = 600

# All matplotlib overrides collected in one table and applied in a single loop.
_RC_OVERRIDES = {
    'font.family': 'Arial',
    # Every text element uses the same point size.
    **dict.fromkeys(
        (
            'font.size',
            'axes.labelsize',
            'axes.titlesize',
            'figure.titlesize',
            'xtick.labelsize',
            'ytick.labelsize',
            'legend.fontsize',
            'legend.title_fontsize',
        ),
        FONT_SIZE_PT,
    ),
    # Tick geometry is identical on the x and y axes.
    **dict.fromkeys(('xtick.major.size', 'ytick.major.size'), 2),
    **dict.fromkeys(('xtick.major.width', 'ytick.major.width'), 0.5),
    **dict.fromkeys(('xtick.minor.size', 'ytick.minor.size'), 1),
    **dict.fromkeys(('xtick.minor.width', 'ytick.minor.width'), 0.5),
    # Line widths and marker sizes.
    'axes.linewidth': 0.5,
    'lines.linewidth': 0.5,
    'grid.linewidth': 0.25,
    'patch.linewidth': 0.25,
    'lines.markeredgewidth': 0.25,
    'lines.markersize': 2,
    # Default figure is 10 x 9 units of 5 mm; on-screen dpi is a quarter of the saved dpi.
    'figure.figsize': (10 * FIVE_MM_IN_INCH, 9 * FIVE_MM_IN_INCH),
    'savefig.dpi': DPI,
    'figure.dpi': DPI // 4,
    # Font type used in PDF/PS output, see:
    #http://phyletica.org/matplotlib-fonts/
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
}

for _rc_name, _rc_value in _RC_OVERRIDES.items():
    matplotlib.rcParams[_rc_name] = _rc_value

# (02) MARCS identifier linking

This notebook serves to map the metadata from ChIP-MS to MARCS gene identifiers.

## Configuration

In [None]:
import pathlib

# Input files consumed by this notebook.
# ChIP-MS metadata produced under outputs/01-extracting.
INPUT_CHIP_MS_METADATA = pathlib.Path('outputs') / '01-extracting' / 'data_metadata.csv'
# MARCS list of proteins (table S2, sheet 02).
INPUT_MARCS_GENE_LIST_MAP = pathlib.Path('data/from-marcs/table-s2.sheet.02.list_of_proteins.tsv.gz')
# MARCS table S3 in long format.
INPUT_MARCS_TABLE_S3 = pathlib.Path('data/from-marcs-chipseq-analysis/table-s3.long.tsv.gz')

# Fail fast, and name the offending file, if any input is missing.
# Still an AssertionError as before, but now with a message.
for _input_path in (INPUT_CHIP_MS_METADATA, INPUT_MARCS_GENE_LIST_MAP, INPUT_MARCS_TABLE_S3):
    assert _input_path.is_file(), f"Required input file not found: {_input_path}"

In [None]:
import pathlib
# Directory that receives every figure, caption and CSV written by this notebook.
OUTPUT_DIRECTORY = pathlib.Path('outputs') / '02-linking-to-MARCS'

# exist_ok=True makes the cell safely re-runnable without a separate
# "does it exist?" check (and without the race between check and creation).
OUTPUT_DIRECTORY.mkdir(parents=True, exist_ok=True)

## Reading datasets

We only need the metadata from ChIP MS, as we will be matching based on the Gene label column

In [None]:
# Load the ChIP-MS metadata table; the first CSV column becomes the index.
data_metadata_chip_ms = pd.read_csv(INPUT_CHIP_MS_METADATA, index_col=0)
data_metadata_chip_ms

We now load the MARCS data in order to create a mapping between MARCS gene labels and gene names

In [None]:
# Load the tab-separated MARCS protein list; the first column becomes the index
# (later cells treat this index as the MARCS gene label).
data_marcs_gene_list = pd.read_csv(INPUT_MARCS_GENE_LIST_MAP, sep='\t', index_col=0)
data_marcs_gene_list

Let's create a map between MARCS Gene Labels and Majority Protein IDs

In [None]:
# Explode the ';'-separated 'Majority protein IDs' into one row per
# (MARCS gene label, UniProt ID) pair.
_protein_ids_wide = data_marcs_gene_list['Majority protein IDs'].str.split(';', expand=True)
_protein_ids_long = _protein_ids_wide.stack().reset_index()
_protein_ids_long.columns = ['marcs_gene_label', 'ix', 'uniprot_id']

# 'ix' (position within the original list) is not needed downstream.
data_marcs_uniprot_id_map = _protein_ids_long.drop(columns='ix').drop_duplicates()
data_marcs_uniprot_id_map

And another one between MARCS gene labels and gene names

In [None]:
# Explode the ';'-separated 'Gene names' into one row per
# (MARCS gene label, gene name) pair; proteins without gene names are skipped.
_gene_names_long = (
    data_marcs_gene_list['Gene names']
    .dropna()
    .str.split(';', expand=True)
    .stack()
    .reset_index()
)
_gene_names_long.columns = ['marcs_gene_label', 'ix', 'gene_name']

# Take an explicit copy of the column subset: adding a column to the bare
# slice would trigger pandas' SettingWithCopyWarning.
data_marcs_gene_name_map = _gene_names_long[['marcs_gene_label', 'gene_name']].copy()
# Lower-cased key used for the case-insensitive join against ChIP-MS gene names.
data_marcs_gene_name_map['gene_name_lowercase'] = data_marcs_gene_name_map['gene_name'].str.lower()
data_marcs_gene_name_map

## Mapping ChIP-MS labels to MARCS 

### via uniprot IDs

We start by mapping the ChIP-MS proteins to MARCS by their uniprot accession number.

Unlike the MARCS data, the Accession column in ChIP-MS contains only one protein ID per row.

We can verify it by checking the maximum length of the strings in this column:

In [None]:
# Length of the longest string in the Accession column.
data_metadata_chip_ms['Accession'].str.len().max()

The entries of maximum length are simply proteins with long-format accessions:

In [None]:
# Show the rows whose accession has the maximum length. The maximum is taken
# from the data instead of being hard-coded, so the check stays meaningful if
# the longest accession is not exactly 10 characters.
_accession_lengths = data_metadata_chip_ms['Accession'].str.len()
data_metadata_chip_ms[_accession_lengths == _accession_lengths.max()]

We therefore create the following data frame for ChIP-MS mappings:

In [None]:
# ChIP-MS side of the join: label (former index), accession and gene name,
# plus a lower-cased gene name for case-insensitive matching.
_chip_ms_ids = data_metadata_chip_ms.loc[:, ['Accession', 'Gene']].reset_index()
_chip_ms_ids.columns = ['chip_ms_label', 'chip_ms_accession', 'chip_ms_gene']
_chip_ms_ids['chip_ms_gene_lowercase'] = _chip_ms_ids['chip_ms_gene'].str.lower()

data_chip_ms_accession_gene_map = _chip_ms_ids
data_chip_ms_accession_gene_map

There is some overlap between the accessions in MARCS and in metadata chip_ms

In [None]:
from matplotlib_venn import venn2

# Exact-match overlap between the UniProt IDs known to MARCS and to ChIP-MS.
_uniprot_ids_marcs = set(data_marcs_uniprot_id_map['uniprot_id'])
_uniprot_ids_chip_ms = set(data_chip_ms_accession_gene_map['chip_ms_accession'])

venn2(
    (_uniprot_ids_marcs, _uniprot_ids_chip_ms),
    set_labels=("In MARCS", "In ChIP-MS"),
)
plt.title("Overlap between UniProt identifiers")

_fname = OUTPUT_DIRECTORY / '01-venn-diagram-of-uniprot-id-overlaps.pdf'
_caption = """
Overlaps between Uniprot IDs in MARCS data and Uniprot IDs observed in the ChIP-MS dataset.
Uniprot IDs are required to match exactly.
"""

# Save the figure and a sibling '<figure>.caption.md' file, then echo the caption.
plt.savefig(_fname, bbox_inches='tight', dpi=DPI)
with open(f'{_fname}.caption.md', 'w') as f:
    f.write(_caption)
print(_caption)

In [None]:
# Inner join: keep only MARCS UniProt IDs that appear verbatim as a ChIP-MS accession.
_chip_ms_accessions = data_chip_ms_accession_gene_map[['chip_ms_label', 'chip_ms_accession']]

marcs_to_chip_ms_map_by_accession = pd.merge(
    left=data_marcs_uniprot_id_map,
    right=_chip_ms_accessions,
    how='inner',
    left_on='uniprot_id',
    right_on='chip_ms_accession',
)

# Record which route produced these pairs (combined with the gene-name route later).
marcs_to_chip_ms_map_by_accession['mapped_via'] = 'Accession'
marcs_to_chip_ms_map_by_accession

### via gene names

Alternatively, we can align MARCS data to ChIP-MS via the parsed gene name column.

This is a less strict mapping, and is mostly useful when we have different variants of the same gene in MARCS, e.g. PHF8 (1) / PHF8 (2), that may not necessarily correspond 1:1 in ChIP-MS.


We can see the quick overlap:

In [None]:
from matplotlib_venn import venn2

# Both sets hold *lower-cased* gene names, so the overlap is case-insensitive.
in_marcs = set(data_marcs_gene_name_map['gene_name_lowercase'].dropna().unique())
in_chip_ms = set(data_chip_ms_accession_gene_map['chip_ms_gene_lowercase'].dropna().unique())

venn2((in_marcs, in_chip_ms), set_labels=['Gene names in MARCS', 'Gene names in ChIP-MS'])
plt.title("Overlap between Gene names")

_fname = OUTPUT_DIRECTORY / '01-venn-diagram-of-gene-name-overlaps.pdf'
# The caption states the matching rule actually used (lower-cased comparison);
# the previous wording claimed an exact match and contained a typo.
_caption = """
Overlaps between Gene names in MARCS data and gene names observed in the ChIP-MS dataset.
Gene names are required to match exactly (ignoring letter case).
"""

# Save the figure and a sibling '<figure>.caption.md' file, then echo the caption.
plt.savefig(_fname, bbox_inches='tight', dpi=DPI)
with open(str(_fname) + '.caption.md', 'w') as f:
    f.write(_caption)
    print(_caption)

We can now make a similar mapping like we did with Uniprot IDs

In [None]:
# Inner join on the lower-cased gene names (case-insensitive match).
_chip_ms_gene_names = data_chip_ms_accession_gene_map[['chip_ms_label', 'chip_ms_gene_lowercase']]

marcs_to_chip_ms_map_by_gene_name = pd.merge(
    left=data_marcs_gene_name_map,
    right=_chip_ms_gene_names,
    how='inner',
    left_on='gene_name_lowercase',
    right_on='chip_ms_gene_lowercase',
)

# Record which route produced these pairs (combined with the accession route later).
marcs_to_chip_ms_map_by_gene_name['mapped_via'] = 'Gene name'
marcs_to_chip_ms_map_by_gene_name

Let's see how much of this mapping overlaps

In [None]:
from matplotlib_venn import venn3_unweighted

# MARCS gene labels overall, and those reachable through each mapping route.
all_marcs_gene_labels = set(data_marcs_gene_list.index)
mapped_via_accession = set(marcs_to_chip_ms_map_by_accession['marcs_gene_label'].unique())
mapped_via_gene_name = set(marcs_to_chip_ms_map_by_gene_name['marcs_gene_label'].unique())

venn3_unweighted(
    (all_marcs_gene_labels, mapped_via_accession, mapped_via_gene_name),
    set_labels=("MARCS gene labels", "Mappable to ChIP-MS via Accession", "Mappable to ChIP-MS via Gene name"),
)

plt.title("Mappability summary between MARCS gene labels and ChIP-MS")

_fname = OUTPUT_DIRECTORY / '01-venn-diagram-of-gene-label-mappability.pdf'
# The caption describes both routes shown in the figure: accessions are joined
# verbatim, gene names are joined on their lower-cased form. The previous text
# was copied from the gene-name figure and contained a typo.
_caption = """
Distribution of the mappability of MARCS gene labels to the ChIP-MS datasets.
Uniprot IDs are required to match exactly; gene names are required to match exactly (ignoring letter case).
"""

# Save the figure and a sibling '<figure>.caption.md' file, then echo the caption.
plt.savefig(_fname, bbox_inches='tight', dpi=DPI)
with open(str(_fname) + '.caption.md', 'w') as f:
    f.write(_caption)
    print(_caption)

In [None]:
# MARCS gene labels reachable through the accession route but not the gene-name route.
mappable_via_accession_only = mapped_via_accession.difference(mapped_via_gene_name)
mappable_via_accession_only

In [None]:
# MARCS gene labels reachable through the gene-name route but not the accession route.
mappable_via_gene_name_only = mapped_via_gene_name.difference(mapped_via_accession)
mappable_via_gene_name_only

We can now compute the final mapping by concatenating the two dataframes:

In [None]:
# Stack the accession-based and gene-name-based maps, keeping only the shared
# columns; pairs found by both routes appear twice at this point.
_map_columns = ['marcs_gene_label', 'chip_ms_label', 'mapped_via']
_partial_maps = [marcs_to_chip_ms_map_by_accession, marcs_to_chip_ms_map_by_gene_name]

full_gene_label_map = pd.concat(
    [partial_map[_map_columns] for partial_map in _partial_maps],
    ignore_index=True,
)
full_gene_label_map

And sorting out the duplication by concatenating the 'mapped_via' column:

In [None]:
def _join_unique_sorted(values):
    """Return the distinct values of one group, sorted and joined with ';'."""
    return ';'.join(sorted(values.unique()))


# One row per (MARCS label, ChIP-MS label) pair; 'mapped_via' lists every route
# that produced the pair.
full_gene_label_map = (
    full_gene_label_map
    .groupby(['marcs_gene_label', 'chip_ms_label'])['mapped_via']
    .apply(_join_unique_sorted)
    .reset_index()
)
full_gene_label_map

These counts should be similar to the venn diagram above (but not exact, due to one-to-many mappings):

In [None]:
# Count the rows of the collapsed map for each distinct 'mapped_via' value.
full_gene_label_map['mapped_via'].value_counts()

Specifically, some ChIP-MS labels map to multiple MARCS labels:

In [None]:
# Number of distinct MARCS labels linked to each ChIP-MS label ...
mapping_counts_chip_ms_to_marcs = (
    full_gene_label_map
    .groupby('chip_ms_label')['marcs_gene_label']
    .nunique()
)
# ... showing only the ChIP-MS labels linked to more than one.
_chip_ms_is_multimapped = mapping_counts_chip_ms_to_marcs > 1
mapping_counts_chip_ms_to_marcs.loc[_chip_ms_is_multimapped]

And some of the MARCS labels map to multiple ChIP-MS labels.
This, unfortunately, is unavoidable.

In [None]:
# Number of distinct ChIP-MS labels linked to each MARCS label ...
mapping_counts_marcs_to_chip_ms = (
    full_gene_label_map
    .groupby('marcs_gene_label')['chip_ms_label']
    .nunique()
)
# ... showing only the MARCS labels linked to more than one.
_marcs_is_multimapped = mapping_counts_marcs_to_chip_ms > 1
mapping_counts_marcs_to_chip_ms.loc[_marcs_is_multimapped]

Save this mapping to CSV

In [None]:
# Persist the collapsed label map. The frame's index is written as the first
# CSV column (to_csv default), so readers should expect that extra column.
full_gene_label_map.to_csv(OUTPUT_DIRECTORY / '02-marcs-to-chip-ms-mapping.csv')

## Linking with MARCS Features

As a final step in this notebook we will map the MARCS Feature effect data (from table S3) to ChIP-MS identifiers.

We will again make use of the dataframe processed in the ChIP-Seq pipeline, as it is easier to ingest.

In [None]:
# Load the tab-separated MARCS table S3; no column is used as the index.
marcs_table_s3 = pd.read_csv(INPUT_MARCS_TABLE_S3, sep='\t')
marcs_table_s3

Before joining, let's quickly count the number of genes for each feature and `significant_category_strong`:

In [None]:
# Distinct gene labels per (Feature, strong category) in the full table S3.
_total_group_keys = ['Feature', 'significant_category_strong']
_total_gene_labels = marcs_table_s3.groupby(_total_group_keys)['Gene label']

counts_marcs_total = _total_gene_labels.nunique()
counts_marcs_total

Now let's join table S3 with the label translation map.
Again, an inner join is fine here, as we can only use the matching data anyway.

In [None]:
# Attach ChIP-MS labels to table S3; rows whose gene label has no mapping are dropped
# (inner join), and multi-mapped gene labels produce one row per ChIP-MS label.
marcs_table_s3_merged = pd.merge(
    left=marcs_table_s3,
    right=full_gene_label_map,
    how='inner',
    left_on='Gene label',
    right_on='marcs_gene_label',
)

In [None]:
# Spot-check: merged rows for feature H3K4me3 in the "Strongly recruited" category.
marcs_table_s3_merged.query('Feature == "H3K4me3" and significant_category_strong == "Strongly recruited"')

Let's re-count the category occupancies so we can see how much of each feature we cover

In [None]:
# Distinct gene labels per (Feature, strong category) that survived the join
# with the ChIP-MS label map.
_matched_group_keys = ['Feature', 'significant_category_strong']
_matched_gene_labels = marcs_table_s3_merged.groupby(_matched_group_keys)['Gene label']

counts_marcs_matched = _matched_gene_labels.nunique()
counts_marcs_matched

Let's make this into a table/figure

In [None]:
def _format_match_annotation(row):
    """Render 'matched/total (percent)' for one row of the statistics table."""
    return '{:,}/{:,} ({:.1%})'.format(
        int(row['matched_in_chip_ms']),
        int(row['total']),
        row['fraction'],
    )


# Side-by-side totals and matched counts per (Feature, strong category).
_counts_by_column = {
    'total': counts_marcs_total,
    'matched_in_chip_ms': counts_marcs_matched,
}
feature_matching_statistics = pd.DataFrame(_counts_by_column)
# Categories missing from the matched counts become NaN here; treat them as 0.
feature_matching_statistics = feature_matching_statistics.fillna(0).astype(int)

_matched_counts = feature_matching_statistics['matched_in_chip_ms']
feature_matching_statistics['fraction'] = _matched_counts / feature_matching_statistics['total']
# Human-readable cell label used by the heatmaps below.
feature_matching_statistics['annot'] = feature_matching_statistics.apply(_format_match_annotation, axis=1)
feature_matching_statistics

For further cells we don't need the category "Neither":

In [None]:
# Keep every Feature (first index level) but only the two "Strongly ..." categories
# on the second level; .copy() detaches the result so later cells can add columns to it.
feature_matching_statistics_no_neither = feature_matching_statistics.loc(axis=0)[:, ['Strongly recruited', 'Strongly excluded']].copy()

In [None]:
# Display the statistics restricted to the two strong categories.
feature_matching_statistics_no_neither

Make a nice heatmap:

In [None]:
# Tall, single-column heatmap: one row per (Feature, strong category).
figure = plt.figure(figsize=(5*FIVE_MM_IN_INCH, 25*FIVE_MM_IN_INCH))

# "Neither" is already excluded from this table; order rows by matched count.
_df = feature_matching_statistics_no_neither.sort_values(by='matched_in_chip_ms', ascending=False)

# Colour encodes the matched fraction; the cell text is the pre-formatted 'annot' string.
_heatmap_options = dict(
    cmap='viridis',
    annot=_df[['annot']],
    fmt='',
    linewidth=0.1,
    robust=True,
)
_hmap = sns.heatmap(_df[['fraction']], **_heatmap_options)

# Hide tick marks on both axes and drop the single x tick altogether.
for _tick_axis in (_hmap.yaxis, _hmap.xaxis):
    _tick_axis.set_tick_params(length=0)
_hmap.set_xticks([])
_hmap.set_ylabel("MARCS Feature and significant category (strong)")

_fname = OUTPUT_DIRECTORY / '02-fraction-of-matched-proteins-in-marcs-categories.pdf'
_caption = """
Fraction of proteins in MARCS feature categories that were matched in ChIP-MS.

The colour highlights the fraction using the viridis colour scale on the right.
The annotated textdisplays the actual numbers of matched proteins (first number),
the total number of proteins in the category (second number, after slash), 
and the percentage of matched proteins between MARCS feature and ChIP-MS.

Heatmap is sorted by number of matched proteins, descending.
"""

# Save the figure and a sibling '<figure>.caption.md' file, then echo the caption.
plt.savefig(_fname, bbox_inches='tight', dpi=DPI)
with open(f'{_fname}.caption.md', 'w') as f:
    f.write(_caption)
print(_caption)

These numbers are fairly low.  We can quickly estimate how these numbers relate to what we would expect by pure chance.

If we were to assume a hypergeometric distribution with the following parameters:

In [None]:
# population size - number of all unique Gene labels in MARCS
# (counted with nunique() so that a repeated index label cannot inflate N;
# this matches how K and the per-category totals are counted)
hypergeom_N = data_marcs_gene_list.index.nunique()
# number of "successes" - number of unique gene labels in the mapping
hypergeom_K = full_gene_label_map['marcs_gene_label'].nunique()

# Displayed as the cell output.
f'N={hypergeom_N}, K={hypergeom_K}'

Then for a MARCS category of size $n$ (column `total`) we would expect $E[k] = n \times K/N$ proteins to match in the dataset.

The direction and level of under-representation can be judged by computing a pearson residual, given the observed number of matched proteins in a category $k$ (column `matched_in_chip_ms`):

$$
\text{Pearson residual} = \frac{k - E[k]}{\sqrt{E[k]}}
$$

In [None]:
# Short alias; the columns below are added to the same frame object.
_stats = feature_matching_statistics_no_neither

# E[k] = n * K / N for a category of size n.
_match_rate = hypergeom_K / hypergeom_N
_stats['expected_number_of_matched'] = _match_rate * _stats['total']

# Pearson residual: (observed - expected) / sqrt(expected).
_observed_minus_expected = _stats['matched_in_chip_ms'] - _stats['expected_number_of_matched']
_stats['pearson_residual'] = _observed_minus_expected / np.sqrt(_stats['expected_number_of_matched'])

# Heatmap label: the existing annotation followed by the expected count.
_expected_labels = _stats['expected_number_of_matched'].map('(Exp = {:.1f})'.format)
_stats['annot_2'] = _stats['annot'].str.cat(_expected_labels, sep=' ')

In [None]:
# Show categories ordered from the largest to the smallest Pearson residual.
feature_matching_statistics_no_neither.sort_values(by='pearson_residual', ascending=False)

In [None]:
# Tall, single-column heatmap: one row per (Feature, strong category).
figure = plt.figure(figsize=(9*FIVE_MM_IN_INCH, 25*FIVE_MM_IN_INCH))

# "Neither" is already excluded from this table; order rows by matched count.
_df = feature_matching_statistics_no_neither.sort_values(by='matched_in_chip_ms', ascending=False)

# Colour encodes the Pearson residual on a diverging scale centred at 0;
# the cell text is the pre-formatted 'annot_2' string.
_hmap = sns.heatmap(
    _df[['pearson_residual']],
    cmap='RdBu_r',
    annot=_df[['annot_2']],
    fmt='',
    linewidth=0.1,
    robust=True,
    center=0,
)

# Hide tick marks on both axes and drop the single x tick altogether.
_hmap.yaxis.set_tick_params(length=0)
_hmap.xaxis.set_tick_params(length=0)
_hmap.set_xticks([])
_hmap.set_ylabel("MARCS Feature and significant category (strong)")

_fname = OUTPUT_DIRECTORY / '02-fraction-of-matched-proteins-in-marcs-categories-pearson-residuals.pdf'
# The residual formula in the caption is parenthesised to match the computation,
# (observed - expected) / sqrt(expected); the previous text read as
# observed - expected/sqrt(expected).
_caption = """
Fraction of proteins in MARCS feature categories that were matched in ChIP-MS.
The annotated text displays the actual numbers of matched proteins (first number),
the total number of proteins in the category (second number, after slash), 
the percentage of matched proteins between MARCS feature and ChIP-MS, 
and the expected number of matched proteins (if matching was random across the whole MARCS dataset - number prefixed by Exp). 

The colour highlights the pearson residual ((observed-expected)/sqrt(expected)). Red categories are strongly over-represented, blue - underrepresented

Heatmap is sorted by number of matched proteins, descending.
"""

# Save the figure and a sibling '<figure>.caption.md' file, then echo the caption.
plt.savefig(_fname, bbox_inches='tight', dpi=DPI)
with open(str(_fname) + '.caption.md', 'w') as f:
    f.write(_caption)
    print(_caption)

As shown above the coverage of the categories is not ideal, but we will have to live with this.

#### Reindexing to ChIP-MS notation

We will now "collapse" the MARCS Features data into the indices of ChIP-MS experiment so it is easier to work with in subsequent notebooks. We will use the same trick as ChIP-seq analysis.

In [None]:
# Distinct values of the Feature column of table S3, as a plain list.
marcs_feature_list = list(marcs_table_s3['Feature'].unique())

In [None]:
# List the table S3 columns whose names do not start with "H".
[c for c in marcs_table_s3.columns if not c.startswith('H')]

In [None]:
# Only use the statistical columns
# (plus 'chip_ms_label', which becomes the new index below).
_merged_df = marcs_table_s3_merged.set_index(['Feature', 'Gene label'])[[
    'Effect',
    '95% CI (+/-)',
    'logFC variance',
    't statistic (moderated)',
    'P value',
    'P value (adjusted)',
    'Moderated t distribution dof',
    'Moderated t distribution stdev',
    'significant',
    'significant_category_weak',
    'significant_category_strong',
    'chip_ms_label',
]]

_column_to_reindex_to = 'chip_ms_label'

# feature name -> frame of statistics indexed by ChIP-MS label
marcs_features_reindexed = {}

for feature in marcs_feature_list:
    # All merged rows for this feature; the remaining index level is 'Gene label'.
    feature_df = _merged_df.xs(feature, level='Feature')
    
    _df = []
    
    # Solve multimappings where one gene name maps to multiple MARCS gene names,
    # by taking the entry with lowest p-value
    # (rows without a P value are dropped before grouping).
    for __, _subdata in feature_df.dropna(subset=['P value']).groupby(_column_to_reindex_to):
        # idxmin() yields a 'Gene label'; .loc then picks that row as a Series.
        # NOTE(review): assumes 'Gene label' is unique within each group - confirm.
        _subdata = _subdata.loc[_subdata['P value'].idxmin()]
        _df.append(_subdata)
        
    # One row per ChIP-MS label; the 'Gene label' row names are replaced by the new index.
    _df = pd.DataFrame(_df).set_index(_column_to_reindex_to)
    assert not _df.index.duplicated().any()
    marcs_features_reindexed[feature] = _df
    
    
# Place the per-feature frames side by side under (feature, statistic) column keys,
# then swap the levels to (statistic, feature) and sort the columns.
marcs_features_reindexed_wide = pd.concat(marcs_features_reindexed.values(), keys=marcs_features_reindexed.keys(), axis=1)
marcs_features_reindexed_wide = marcs_features_reindexed_wide.swaplevel(axis=1)
marcs_features_reindexed_wide.sort_index(axis=1, inplace=True)

In [None]:
# Distinct top-level column names of the wide table (the statistic names,
# since the column levels were swapped to (statistic, feature)).
marcs_features_reindexed_wide.columns.get_level_values(0).unique()

In [None]:
# Display the wide feature table indexed by ChIP-MS label.
marcs_features_reindexed_wide

Quickly verify that the feature category mapping is correct, i.e. that every ChIP-MS label assigned to a MARCS category links to at least one MARCS gene label in that category.
Additionally, print out the ChIP-MS labels in each of the strong categories (for debugging purposes).

In [None]:
# Sanity check: every ChIP-MS label placed in a (feature, category) cell of the
# wide table must link to at least one MARCS gene label that table S3 puts in
# that same cell.
_checked_categories = ['Neither', 'Strongly recruited', 'Strongly excluded']

for marcs_feature in marcs_feature_list:
    _is_feature = marcs_table_s3['Feature'] == marcs_feature
    _wide_categories = marcs_features_reindexed_wide['significant_category_strong', marcs_feature]

    for category in _checked_categories:
        # MARCS gene labels that table S3 assigns to this feature/category.
        _is_category = marcs_table_s3['significant_category_strong'] == category
        available_marcs_labels = set(marcs_table_s3.loc[_is_feature & _is_category, 'Gene label'])

        # ChIP-MS labels that the wide table assigns to this feature/category.
        chip_ms_labels = marcs_features_reindexed_wide.loc[_wide_categories == category].index

        # Print the members of the two strong categories only (debugging aid).
        if category != 'Neither':
            print("{} - {} n={:,}:\n{}\n".format(marcs_feature, category, len(chip_ms_labels), ', '.join(sorted(chip_ms_labels))))

        _map = full_gene_label_map[full_gene_label_map['chip_ms_label'].isin(chip_ms_labels)]
        for label, submap in _map.groupby('chip_ms_label'):
            marcs_labels = set(submap['marcs_gene_label'])
            assert not available_marcs_labels.isdisjoint(marcs_labels), f"{label} should not be associated with {marcs_feature}: {category}"
            


Finally write output:

In [None]:
# Work on a copy so the MultiIndex version stays available in the notebook.
_df = marcs_features_reindexed_wide.copy()

# Flatten the two-level column keys into single '<level0>__<level1>' strings.
_flat_column_names = [
    '__'.join(str(level_value) for level_value in column_key)
    for column_key in _df.columns
]
_df.columns = _flat_column_names

_df.to_csv(OUTPUT_DIRECTORY / '03-mapping-between-chip-ms-and-marcs-features.csv')