In [None]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import os
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
sns.set_palette(['#1E1E1E', '#BB3524', '#F5D54A', '#384827', '#282F44'])
sns.set_context('paper')
sns.set_style({'axes.axisbelow': True, 
               'axes.edgecolor': '.15',
               'axes.facecolor': 'white',
               'axes.grid': False, 
               'axes.labelcolor': '.15', 
               'figure.facecolor': 'white', 
               'grid.color': '.15',
               'grid.linestyle': ':', 
               'grid.alpha': .5, 
               'image.cmap': 'Greys', 
               'legend.frameon': False, 
               'legend.numpoints': 1, 
               'legend.scatterpoints': 1,
               'lines.solid_capstyle': 'butt', 
               'axes.spines.right': False, 
               'axes.spines.top': False,  
               'text.color': '.15',  
               'xtick.top': False, 
               'ytick.right': False, 
               'xtick.color': '.15',
               'xtick.direction': 'out', 
               'ytick.color': '.15', 
               'ytick.direction': 'out', 
              })


import matplotlib

# All text elements share one small point size.
FONT_SIZE_PT = 5

# Figure sizes are expressed as multiples of 5 mm (converted to inches).
FIVE_MM_IN_INCH = 0.19685
DPI = 600

matplotlib.rcParams.update({
    # --- fonts ---
    'font.family': 'Arial',
    'font.size': FONT_SIZE_PT,
    'axes.labelsize': FONT_SIZE_PT,
    'axes.titlesize': FONT_SIZE_PT,
    'figure.titlesize': FONT_SIZE_PT,
    'xtick.labelsize': FONT_SIZE_PT,
    'ytick.labelsize': FONT_SIZE_PT,
    'legend.fontsize': FONT_SIZE_PT,
    'legend.title_fontsize': FONT_SIZE_PT,

    # --- ticks (x and y axes are configured identically) ---
    'xtick.major.size': 2,
    'ytick.major.size': 2,
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
    'xtick.minor.size': 1,
    'ytick.minor.size': 1,
    'xtick.minor.width': 0.5,
    'ytick.minor.width': 0.5,

    # --- line widths and markers ---
    'axes.linewidth': 0.5,
    'lines.linewidth': 0.5,
    'grid.linewidth': 0.25,
    'patch.linewidth': 0.25,
    'lines.markeredgewidth': 0.25,
    'lines.markersize': 2,

    # --- figure size and resolution ---
    'figure.figsize': (10 * FIVE_MM_IN_INCH, 9 * FIVE_MM_IN_INCH),
    'savefig.dpi': DPI,
    # On-screen rendering uses half of the export resolution.
    'figure.dpi': DPI // 2,

    # --- font embedding in vector output ---
    # See http://phyletica.org/matplotlib-fonts/
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

pd.set_option("display.max_columns", 200)

In [None]:
# When the notebook is executed by snakemake, a `snakemake` object is present in
# the namespace; otherwise fall back to hard-coded paths for interactive debugging.
HAVE_SNAKEMAKE = 'snakemake' in locals()

if not HAVE_SNAKEMAKE:
    print("No snakemake -- DEBUG MODE")

    # Scratch directory that receives the outputs of debug runs.
    _OUTDIR = '.nb-testing-outputs'
    os.makedirs(_OUTDIR, exist_ok=True)

    # Inputs
    input_marcs_gene_label_map = '../../../output/interim/marcs/genes_to_marcs_from_table-s1.tsv.gz'
    input_encode_metadata = '../../../data/raw/encode/encode_metadata.2021-11-05.tsv.gz'

    # Parameters
    param_encode_data_type = 'bigWig'
    param_encode_download_dir = '../../../output/interim/encode/downloaded_datasets'
    param_cell_line = 'HepG2'
    param_gene_label_separator = '||'

    # Outputs: one table per dataset kind
    output_tsvs = {}
    for k in ['protein', 'feature_histone', 'feature_accessibility']:
        output_tsvs[k] = os.path.join(_OUTDIR, f'output.encode_{k}.tsv.gz')
else:
    # Inputs
    input_marcs_gene_label_map = snakemake.input.marcs_gene_label_map
    input_encode_metadata = snakemake.input.encode_metadata

    # Parameters
    param_encode_download_dir = str(snakemake.params.encode_download_dir)
    param_encode_data_type = snakemake.params.encode_data_type
    param_cell_line = snakemake.params.cell_line
    param_gene_label_separator = snakemake.params.gene_label_separator

    # Outputs: one table per dataset kind
    output_tsvs = {
        'protein': snakemake.output.protein,
        'feature_histone': snakemake.output.feature_histone,
        'feature_accessibility': snakemake.output.feature_accessibility,
    }
        

In [None]:
# Echo every configured input/parameter/output and fail fast when an input file
# is missing or an output directory cannot be written to.
for variable_ in [
    'input_marcs_gene_label_map',
    'input_encode_metadata',
    'param_encode_download_dir',
    'param_encode_data_type',
    'param_cell_line',
    'param_gene_label_separator',
    'output_tsvs',
]:
    value_ = locals()[variable_]

    # Normalise to a list of (label, value) pairs so that dicts and lists of
    # paths are checked element by element.
    if isinstance(value_, dict):
        iter_ = [(f'{variable_}[{k}]', v) for k, v in value_.items()]
    elif isinstance(value_, list):
        iter_ = [(variable_, v) for v in value_]
    else:
        iter_ = [(variable_, value_)]

    for var_, val_ in iter_:
        print(f'- {var_} = {val_!r}')

        # Both branches key off the variable name itself so that the checks
        # stay consistent for scalar, list and dict values.
        if variable_.startswith('input'):
            exists = os.path.isfile(val_)
            print(f'  {var_} exists = {exists}')
            if not exists:
                raise FileNotFoundError(f"File described by {var_} does not exist")
        elif variable_.startswith('output'):
            # A bare filename has an empty dirname, which refers to the current
            # directory; os.access('') would always report "not writable".
            output_dir_ = os.path.dirname(val_) or '.'
            writable = os.access(output_dir_, os.W_OK)
            print(f'  {var_} writable = {writable}')
            if not writable:
                raise PermissionError(f"Directory for {var_} is not writable")

# Load Encode

We first load the encode metadata dataset

In [None]:
# Read the ENCODE metadata table. The file accession is kept as a column and
# also used as the row index (named 'Identifier'); every row is tagged with its source.
data_encode = (
    pd.read_csv(input_encode_metadata, sep='\t')
    .set_index('File accession', drop=False)
    .rename_axis('Identifier')
    .assign(Source='encode')
)

In [None]:
data_encode['Assay'].value_counts()

Filter out data that is not useful:

In [None]:
# Keep only released GRCh38 files that come from the assays we can use...
keep = (
    (data_encode['File Status'] == 'released')
    & (data_encode['File analysis status'] == 'released')
    & (data_encode['File assembly'] == 'GRCh38')
    & data_encode['Assay'].isin(['TF ChIP-seq', 'Histone ChIP-seq', 'DNase-seq', 'ATAC-seq'])
)

# ...and that are of the requested file type (only bigWig and bed are supported).
if param_encode_data_type not in ('bigWig', 'bed'):
    raise ValueError(f"Unsupported data type: {param_encode_data_type=}")
keep &= data_encode['File type'] == param_encode_data_type

data_encode = data_encode[keep]

In [None]:
data_encode['File type'].value_counts()

This does a good job of subsetting the data, but we are still left with a few cases where an experiment has multiple, competing output files. Let's identify such cases:

In [None]:
# Number of output files each experiment is expected to have, keyed by assay.
EXPECTED_NUMBER_OF_FILES = pd.Series(
    {
        'TF ChIP-seq': 1,
        'Histone ChIP-seq': 1,
        'DNase-seq': 1,
        'ATAC-seq': 1,
    },
    name='expected_number_of_files',
)

In [None]:
# Experiment accession -> assay; each experiment must map to exactly one assay.
_assay_lookup = (
    data_encode[['Experiment accession', 'Assay']]
    .drop_duplicates()
    .set_index('Experiment accession')
)
assert not _assay_lookup.index.duplicated().any()

# Count the files of every experiment and put the count next to the number
# expected for the experiment's assay.
_files_per_experiment = (
    data_encode.groupby('Experiment accession').size()
    .rename('number_of_files')
    .to_frame()
    .join(_assay_lookup)
    .join(EXPECTED_NUMBER_OF_FILES, on='Assay')
)
_files_per_experiment['wrong_number_of_files'] = (
    _files_per_experiment['expected_number_of_files'] != _files_per_experiment['number_of_files']
)
_files_per_experiment

The `True` number below should be very low. If it is not, it likely means that you forgot `&files.preferred_default=true` in your metadata query...

In [None]:
_files_per_experiment['wrong_number_of_files'].value_counts()

Chances are that the few remaining duplicates are due to different versions of the analysis pipeline:

In [None]:
data_encode[data_encode['Experiment accession'].isin(_files_per_experiment[_files_per_experiment['wrong_number_of_files']].index)].sort_values(by=['Experiment accession', 'File analysis title']).head()



In [None]:
# Analysis pipeline versions, most preferred first. The position in this list
# is used as the priority when an experiment has files from several versions.
data_analysis_version_priority_order = [
 'ENCODE4 v3.0.0 GRCh38',
 'ENCODE4 v3.0.0-alpha.2 GRCh38',
 'ENCODE4 v1.10.0 GRCh38',
 'ENCODE4 v1.9.2 GRCh38',
 'ENCODE4 v1.9.1 GRCh38',
 'ENCODE4 v1.9.0 GRCh38',
 'ENCODE4 v1.8.1 GRCh38',
 'ENCODE4 v1.8.0 GRCh38',
 'ENCODE4 v1.7.1 GRCh38',
 'ENCODE4 v1.7.0 GRCh38',
 'ENCODE4 v1.6.1 GRCh38',
 'ENCODE4 v1.6.0 GRCh38',
 'ENCODE4 v1.5.1 GRCh38',
 'ENCODE4 v1.5.0 GRCh38',
 'ENCODE4 v1.4.0 GRCh38',
 'ENCODE4 v1.1.6 GRCh38',
 'ENCODE4 v1.1.5 GRCh38',
 'ENCODE4 GRCh38',
 'ENCODE3 GRCh38'
]

# Every analysis title present in the metadata must have a known priority.
# Report the offending titles so the list above can be extended.
_unknown_analysis_titles = [
    v for v in data_encode['File analysis title'].unique()
    if v not in data_analysis_version_priority_order
]
assert not _unknown_analysis_titles, (
    "Analysis titles missing from data_analysis_version_priority_order: "
    f"{_unknown_analysis_titles!r}"
)

So keep only the latest version for each experiment...

In [None]:
def _keep_best_version(files):
    """Return the rows of `files` that belong to the highest-priority analysis version present."""
    # A lower position in the priority list means a more preferred version.
    rank = files['File analysis title'].apply(data_analysis_version_priority_order.index)
    return files[rank == rank.min()]


# Process the experiments one at a time and stitch the surviving rows back together.
_clean_encode = pd.concat(
    [_keep_best_version(files) for _, files in data_encode.groupby('Experiment accession')]
)

data_encode = _clean_encode

This should eliminate most duplicates:

In [None]:
# Experiment accession -> assay; each experiment must map to exactly one assay.
_assay_lookup = (
    data_encode[['Experiment accession', 'Assay']]
    .drop_duplicates()
    .set_index('Experiment accession')
)
assert not _assay_lookup.index.duplicated().any()

# Re-count the files per experiment now that only one analysis version is kept.
_files_per_experiment = (
    data_encode.groupby('Experiment accession').size()
    .rename('number_of_files')
    .to_frame()
    .join(_assay_lookup)
    .join(EXPECTED_NUMBER_OF_FILES, on='Assay')
)
_files_per_experiment['wrong_number_of_files'] = (
    _files_per_experiment['expected_number_of_files'] != _files_per_experiment['number_of_files']
)
_files_per_experiment['wrong_number_of_files'].value_counts()

If any experiments with more than one file remain, give up on them and drop them from the dataset:

In [None]:
# Experiments that still have an unexpected number of files are dropped.
to_remove = frozenset(_files_per_experiment[_files_per_experiment['wrong_number_of_files']].index)
if to_remove:
    print("Removing {:,} experiments because we couldn't figure out which peakset to use:".format(len(to_remove)))
    # Sorted so that the log is stable between runs.
    print(sorted(to_remove))

# Take an explicit copy: new columns are assigned to this frame in later cells,
# and assigning to a boolean-indexed slice triggers SettingWithCopyWarning.
data_encode = data_encode[~data_encode['Experiment accession'].isin(to_remove)].copy()

At this point we have a more or less clean encode metadata set:

In [None]:
data_encode.head()

Now clean up the target information into column `Factor`

In [None]:
data_encode['Experiment target'].value_counts()

In [None]:
data_encode['Factor'] = data_encode['Experiment target'].str.rpartition('-')[0]
data_encode['Factor'].value_counts().head(20)

We cannot use the experiment target column for other assays:

In [None]:
data_encode[data_encode['Factor'].isnull()]['Assay'].value_counts()

Instead use assay name:

In [None]:
data_encode.loc[data_encode['Factor'].isnull(), 'Factor'] = data_encode.loc[data_encode['Factor'].isnull(), 'Assay']

In [None]:
data_encode['Factor'].value_counts()

Also assign factor types based on assay:

In [None]:
# Factor type is derived from the assay; rows with any other assay stay None.
_ASSAYS_BY_FACTOR_TYPE = {
    'protein': ['TF ChIP-seq'],
    'feature_histone': ['Histone ChIP-seq'],
    'feature_accessibility': ['DNase-seq', 'ATAC-seq'],
}

data_encode['FactorType'] = None
for _factor_type, _assays in _ASSAYS_BY_FACTOR_TYPE.items():
    data_encode.loc[data_encode['Assay'].isin(_assays), 'FactorType'] = _factor_type


In [None]:
data_encode.loc[data_encode['FactorType'] == 'protein', 'Factor'].value_counts().head(20)

In [None]:
data_encode.loc[data_encode['FactorType'] == 'feature_histone', 'Factor'].value_counts().head(20)

Join cell type columns into one

In [None]:
CELL_TYPE_COLUMN = 'Cell_full_type'

# "<biosample type>|<biosample term name>"; each part is passed through str(),
# so a missing value shows up as the text 'nan'.
_biosample_type = data_encode['Biosample type'].map(str)
_biosample_term = data_encode['Biosample term name'].map(str)
data_encode[CELL_TYPE_COLUMN] = _biosample_type + '|' + _biosample_term

data_encode[CELL_TYPE_COLUMN].value_counts().head(20)

In [None]:
data_encode.groupby(['Assay', 'Output type']).size().sort_values(ascending=False)

Finally propose the filename download locations for each of the files (even if we won't end up using them)

In [None]:
import re

def get_ext(path):
    """Return the file extension of `path`, treating `.gz` as a compound suffix.

    For a gzipped file the extension in front of `.gz` is included as well
    (e.g. `.bed.gz`); otherwise this is the plain `os.path.splitext` extension.
    """
    stem, suffix = os.path.splitext(path)
    if suffix != '.gz':
        return suffix

    # Gzipped: also pick up whatever extension precedes '.gz' (may be empty).
    inner_suffix = os.path.splitext(stem)[1]
    return inner_suffix + suffix

def get_filename(row):
    """Build the local download path for one metadata row.

    The path has the form
    `<param_encode_download_dir>/encode_<FactorType>/<factor>.<cell type>.encode-<id><ext>`,
    where the factor, cell type and row identifier have every run of
    non-alphanumeric characters replaced by a single underscore, and the
    extension is taken from the file's download URL.
    """
    def _slug(text):
        # Collapse anything that is not a letter or a digit into '_'.
        return re.sub('[^a-zA-Z0-9]+', '_', text)

    cell_type_part = _slug(row[CELL_TYPE_COLUMN])
    factor_part = _slug(row['Factor'])
    id_part = _slug(row.name)
    extension = get_ext(row['File download URL'])
    factor_type = row['FactorType']

    subdirectory = f'encode_{factor_type}'
    basename = f'{factor_part}.{cell_type_part}.encode-{id_part}{extension}'
    return os.path.join(param_encode_download_dir, subdirectory, basename)


data_encode['Filename'] = data_encode.apply(get_filename, axis=1)

In [None]:
data_encode

# Encode Proteins dataset: linking with MARCS

We will now create a map between ENCODE data and MARCS, we load the precomputed mapping between gene identifiers and MARCS labels:

In [None]:
data_marcs = pd.read_csv(input_marcs_gene_label_map, sep='\t')
data_marcs.head()

We will match datasets to MARCS based on lowercase gene names

In [None]:
data_marcs['gene_name_lowercase'] = data_marcs['gene_name'].str.lower()
data_marcs.head()

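When several MARCS gene labels share the same gene name, they are joined into a single string using the configured separator (`param_gene_label_separator`, a double bar `||` by default):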

In [None]:
# Lowercase gene name -> all of its MARCS gene labels joined by the separator.
marcs_lookup = (
    data_marcs
    .groupby('gene_name_lowercase')['marcs_gene_label']
    .agg(param_gene_label_separator.join)
)

In [None]:
# Gene names that map to more than one MARCS label. The configured separator is
# matched literally (regex=False) rather than through a hard-coded '||' pattern.
marcs_lookup[marcs_lookup.str.contains(param_gene_label_separator, regex=False)]

Now we want to select the ENCODE proteins that can be matched to MARCS:

In [None]:
data_encode_proteins = data_encode.query("FactorType == 'protein'")
data_encode_proteins.head()

In [None]:
# `data_encode_proteins` is a slice of `data_encode`; .assign() builds a new
# frame instead of writing into that slice (avoids SettingWithCopyWarning).
data_encode_proteins = data_encode_proteins.assign(
    Factor_lowercase=data_encode_proteins['Factor'].str.lower()
)

encode_proteins_all = set(data_encode_proteins['Factor_lowercase'].unique())
encode_proteins_in_marcs = encode_proteins_all & set(marcs_lookup.index)

# Guard against an empty protein set so that the summary does not divide by zero.
_matched_fraction = (
    len(encode_proteins_in_marcs) / len(encode_proteins_all) if encode_proteins_all else 0.0
)
print("{:,}/{:,} ({:.2%}) of unique ENCODE protein names can be matched to MARCS data".format(
    len(encode_proteins_in_marcs), len(encode_proteins_all), _matched_fraction
))

# .loc does not accept a set as an indexer in recent pandas, so pass a sorted list.
# The separator is escaped because a multi-character split pattern is treated as a regex.
marcs_gene_labels_with_encode_ids = (
    marcs_lookup.loc[sorted(encode_proteins_in_marcs)]
    .str.split(re.escape(param_gene_label_separator), expand=True)
    .stack()
    .unique()
)
print("This corresponds to {:,} unique gene labels in MARCS ".format(len(marcs_gene_labels_with_encode_ids)))

Leave only data with MARCS labels:

In [None]:
data_encode_proteins = pd.merge(data_encode_proteins, marcs_lookup, left_on='Factor_lowercase', right_index=True, how='inner')
data_encode_proteins = data_encode_proteins.drop(columns='Factor_lowercase')
data_encode_proteins.head()

Break down the results by cell line:

In [None]:
cell_line_counts = data_encode_proteins.groupby(CELL_TYPE_COLUMN)['Factor'].nunique()
cell_line_counts.sort_values(ascending=False).head(10)

In [None]:
cell_lines_to_keep = {f'cell line|{param_cell_line}'}
print('Keeping only the data from:', ', '.join(cell_lines_to_keep))

In [None]:
data_encode_proteins = data_encode_proteins[data_encode_proteins[CELL_TYPE_COLUMN].isin(cell_lines_to_keep)]

This leaves this many unique factors:

In [None]:
data_encode_proteins['Factor'].nunique()

Representing this many MARCS IDs:

In [None]:
# Split the joined labels on the configured separator (escaped, because a
# multi-character split pattern is treated as a regex) and count distinct labels.
(
    data_encode_proteins['marcs_gene_label']
    .str.split(re.escape(param_gene_label_separator), expand=True)
    .stack()
    .nunique()
)

Let's make a heatmap

In [None]:
# Count datasets per (MARCS gene label, cell type) and pivot the cell types into
# columns; combinations with no datasets end up as NaN.
matrix = data_encode_proteins.groupby(['marcs_gene_label', CELL_TYPE_COLUMN]).size()
matrix = matrix.unstack(CELL_TYPE_COLUMN)

# Clustered heatmap of the counts:
# - NaN cells are plotted as 0 but hidden by the mask;
# - row and column clustering are both switched on only when there is more than one column;
# - the figure size scales with the number of columns and rows of the matrix;
# - yticklabels=1 asks for a label on every row.
_cmap = sns.clustermap(
   matrix.fillna(0), mask=matrix.isnull(), 
    row_cluster=len(matrix.columns) > 1,
    col_cluster=len(matrix.columns) > 1,
    metric='cosine', method='complete', 
    annot=matrix,
    fmt='.0f',
   linewidth=1,
   cmap='GnBu',
   figsize=(FIVE_MM_IN_INCH*1.8*(len(matrix.columns)), FIVE_MM_IN_INCH*(len(matrix)) * 0.6),
   yticklabels=1,
)
# Label the colour bar and hide the tick marks (tick length 0) on both heatmap axes.
_cmap.cax.set_ylabel("Number of datasets")
_cmap.ax_heatmap.xaxis.set_tick_params(length=0)
_cmap.ax_heatmap.yaxis.set_tick_params(length=0)

# _cmap.savefig(os.path.join(output_plots_dir, 'n_encode_datasets_per_cell_line_per_marcs_label.pdf'))

# Encode feature data

Unlike the protein datasets, we only need to filter the features by cell type:

In [None]:
# One table per feature type, restricted to the selected cell line(s).
_in_selected_cell_lines = data_encode[CELL_TYPE_COLUMN].isin(cell_lines_to_keep)

data_features = {}
for feature in ('feature_histone', 'feature_accessibility'):
    _is_feature = data_encode['FactorType'] == feature
    data_features[feature] = data_encode[_is_feature & _in_selected_cell_lines]

In [None]:
data_features['feature_histone'].groupby('Factor').size().sort_values(ascending=False).head(10)

In [None]:
data_features['feature_accessibility'].groupby('Factor').size().sort_values(ascending=False).head(10)

# Output

At this point we are done; all that is left is to save the output files:

In [None]:
# Write one tab-separated table per dataset kind, rows ordered by identifier.
_tables_to_write = {'protein': data_encode_proteins, **data_features}
for _dataset, _table in _tables_to_write.items():
    _table.sort_index().to_csv(output_tsvs[_dataset], sep='\t')