In [1]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import os
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
# Colour cycle and base context for all figures
sns.set_palette('Dark2')
sns.set_context('paper')

# Style overrides handed to seaborn in one go, grouped by what they affect
_AXES_STYLE = {
    # Backgrounds
    'axes.facecolor': 'white',
    'figure.facecolor': 'white',

    # Grid: on, dotted, semi-transparent, drawn below the plotted data
    'axes.grid': True,
    'axes.axisbelow': True,
    'grid.color': '.15',
    'grid.linestyle': ':',
    'grid.alpha': .5,

    # Axes frame: keep only the left and bottom spines
    'axes.edgecolor': '.15',
    'axes.spines.right': False,
    'axes.spines.top': False,

    # Text and label colours
    'axes.labelcolor': '.15',
    'text.color': '.15',

    # Ticks: outward facing, none on the top/right sides
    'xtick.top': False,
    'ytick.right': False,
    'xtick.color': '.15',
    'ytick.color': '.15',
    'xtick.direction': 'out',
    'ytick.direction': 'out',

    # Legend, lines and images
    'legend.frameon': False,
    'legend.numpoints': 1,
    'legend.scatterpoints': 1,
    'lines.solid_capstyle': 'butt',
    'image.cmap': 'Greys',
}
sns.set_style(_AXES_STYLE)


import matplotlib

# Single font size (in points) used for every text element
FONT_SIZE_PT = 5

# Figure sizes are expressed in multiples of 5 mm (converted to inches for matplotlib)
FIVE_MM_IN_INCH = 0.19685
DPI = 600

# All rcParams overrides in one table; applied below through regular item
# assignment so that matplotlib validates every value.
_RC_OVERRIDES = {
    # Fonts
    'font.family': 'Arial',
    'font.size': FONT_SIZE_PT,
    'axes.labelsize': FONT_SIZE_PT,
    'axes.titlesize': FONT_SIZE_PT,
    'figure.titlesize': FONT_SIZE_PT,
    'xtick.labelsize': FONT_SIZE_PT,
    'ytick.labelsize': FONT_SIZE_PT,
    'legend.fontsize': FONT_SIZE_PT,
    'legend.title_fontsize': FONT_SIZE_PT,

    # Major ticks
    'xtick.major.size': 2,
    'ytick.major.size': 2,
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,

    # Minor ticks
    'xtick.minor.size': 1,
    'ytick.minor.size': 1,
    'xtick.minor.width': 0.5,
    'ytick.minor.width': 0.5,

    # Line widths and markers
    'axes.linewidth': 0.5,
    'lines.linewidth': 0.5,
    'grid.linewidth': 0.25,
    'patch.linewidth': 0.25,
    'lines.markeredgewidth': 0.25,
    'lines.markersize': 2,

    # Default figure is 50 mm x 45 mm; on-screen DPI is a quarter of the saved DPI
    'figure.figsize': (10 * FIVE_MM_IN_INCH, 9 * FIVE_MM_IN_INCH),
    'savefig.dpi': DPI,
    'figure.dpi': DPI // 4,

    # Font type 42 for PDF/PS output, see
    # http://phyletica.org/matplotlib-fonts/
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
}

for _rc_name, _rc_value in _RC_OVERRIDES.items():
    matplotlib.rcParams[_rc_name] = _rc_value

# (05) Excel output

## Configuration

In [2]:
import pathlib

# Files consumed by this notebook. Each path is checked as soon as it is defined so
# that a missing input fails here, with the offending path in the message, rather
# than later inside a pandas call.

# MARCS feature statistics, indexed by ChIP-MS label
INPUT_MARCS_DATA = pathlib.Path('outputs') / '02-linking-to-MARCS' / '03-mapping-between-chip-ms-and-marcs-features.csv'
assert INPUT_MARCS_DATA.is_file(), 'Input file not found: {}'.format(INPUT_MARCS_DATA)

# MARCS gene label <-> ChIP-MS label mapping
INPUT_GENE_LABEL_MAP = pathlib.Path('outputs') / '02-linking-to-MARCS' / '02-marcs-to-chip-ms-mapping.csv'
assert INPUT_GENE_LABEL_MAP.is_file(), 'Input file not found: {}'.format(INPUT_GENE_LABEL_MAP)

# Full model results (normalised data, coefficient estimates, per-contrast statistics)
INPUT_MODEL_RESULTS = pathlib.Path('outputs') / '03-transformation-and-modelling' / '07-output-full_results.csv'
assert INPUT_MODEL_RESULTS.is_file(), 'Input file not found: {}'.format(INPUT_MODEL_RESULTS)

# ChIP-MS metadata
INPUT_CHIP_MS_METADATA = pathlib.Path('outputs') / '01-extracting' / 'data_metadata.csv'
assert INPUT_CHIP_MS_METADATA.is_file(), 'Input file not found: {}'.format(INPUT_CHIP_MS_METADATA)

# Unnormalised ChIP-MS numeric data
INPUT_NUMERIC_DATA = pathlib.Path('outputs') / '01-extracting' / 'data_numeric.csv'
assert INPUT_NUMERIC_DATA.is_file(), 'Input file not found: {}'.format(INPUT_NUMERIC_DATA)

In [3]:
# Model contrasts to export. Kept as a list: it is used as a list-like
# indexer on the column MultiIndex further down.
MODEL_COEFS = ['H3K4me1vsControl', 'H3K4me3vsControl', 'H3K4me3vsH3K4me1']

# MARCS features in the order they are exported (same order as in Fig 3)
MARCS_FEATURE_ORDER = [
    'H2A.Z',
    'meDNA',
    'H3K4me1',
    'H3K4me3',
    'H3ac',
    'H3K9acK14ac',
    'H3K27ac',
    'H3K9me2',
    'H3K9me3',
    'H3K27me2',
    'H3K27me3',
    'H4ac',
    'H4K16ac',
    'H4K20me2',
    'H4K20me3',
]

In [4]:
import pathlib
# Directory that receives the outputs of this notebook
OUTPUT_DIRECTORY = pathlib.Path('outputs') / '05-excel-output'

# exist_ok=True makes re-running the cell a no-op and removes the
# check-then-create race; a FileExistsError is still raised if the path
# exists but is not a directory.
OUTPUT_DIRECTORY.mkdir(parents=True, exist_ok=True)

## Reading

### MARCS

We only need the prepared feature data from previous notebooks

In [5]:
data_marcs_features = pd.read_csv(INPUT_MARCS_DATA, index_col=0)

# The CSV stores a flat header of the form '<stat>__<feature>';
# split it back into a two-level column index.
_marcs_header = [column.split('__') for column in data_marcs_features.columns]
data_marcs_features.columns = pd.MultiIndex.from_tuples(
    _marcs_header,
    names=['marcs_stat', 'marcs_feature'],
)

data_marcs_features

marcs_stat,95% CI (+/-),95% CI (+/-),95% CI (+/-),95% CI (+/-),95% CI (+/-),95% CI (+/-),95% CI (+/-),95% CI (+/-),95% CI (+/-),95% CI (+/-),...,t statistic (moderated),t statistic (moderated),t statistic (moderated),t statistic (moderated),t statistic (moderated),t statistic (moderated),t statistic (moderated),t statistic (moderated),t statistic (moderated),t statistic (moderated)
marcs_feature,H2A.Z,H3K27ac,H3K27me2,H3K27me3,H3K4me1,H3K4me3,H3K9acK14ac,H3K9me2,H3K9me3,H3ac,...,H3K4me3,H3K9acK14ac,H3K9me2,H3K9me3,H3ac,H4K16ac,H4K20me2,H4K20me3,H4ac,meDNA
chip_ms_label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
ACIN1,0.211802,0.457351,0.200362,0.211652,0.269613,0.251001,0.145687,0.180162,0.231665,0.162215,...,2.950435,-2.996371,1.970440,-3.132679,-3.998931,-1.325173,-4.314919,3.179548,1.586292,-0.650315
ACTA2,0.136765,0.159660,0.401742,0.105261,0.102875,0.115300,0.140245,0.400128,0.147988,0.144921,...,2.974474,20.652988,-1.408574,0.621017,22.407837,2.748171,4.271652,0.000565,6.184322,-4.371167
ACTB,0.149708,0.228745,0.225918,0.114453,0.111949,0.152387,0.137878,0.179468,0.148311,0.180897,...,2.222446,20.864091,1.126476,0.931822,17.952188,3.376827,3.599880,-0.312255,6.602180,-7.888528
ACTL6A,0.282852,0.143716,0.220205,0.081028,0.071553,0.118745,0.128804,0.231306,0.093425,0.162082,...,2.972668,19.593866,-1.219388,-0.122811,16.775046,2.850633,3.277666,-0.078187,7.699869,-13.615176
ADAR,0.138593,0.205294,0.137140,0.089515,0.120810,0.182271,0.102996,0.147409,0.089110,0.101751,...,2.751459,-4.002974,1.212874,-1.611738,-5.584594,-0.795835,-4.750740,-0.597883,-1.235132,-2.406844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SSB,,,,,,,,,0.106573,,...,,,,0.606704,,,,,,
TIA1,,,,,,,,,0.109572,,...,,,,2.377367,,,,,,
TIAL1,,,,,,,,,0.109572,,...,,,,2.377367,,,,,,
XPO1,,,,,,,,,0.564072,,...,,,,-0.327175,,,-0.471495,1.681556,,


### Metadata

In [6]:
# ChIP-MS metadata table; the first CSV column becomes the row index
data_metadata_chip_ms = pd.read_csv(INPUT_CHIP_MS_METADATA, index_col=0)
data_metadata_chip_ms

Unnamed: 0_level_0,# PSMs,# Peptides,# Unique Peptides,Accession,Coverage [%],Description,Gene
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GBE1,2,2,2,Q04446,3,"1,4-alpha-glucan-branching enzyme OS=Homo sapi...",GBE1
EFTUD2,44,15,14,Q15029,23,116 kDa U5 small nuclear ribonucleoprotein com...,EFTUD2
YWHAB,44,9,4,P31946,43,14-3-3 protein beta/alpha OS=Homo sapiens OX=9...,YWHAB
YWHAE,53,11,8,P62258,49,14-3-3 protein epsilon OS=Homo sapiens OX=9606...,YWHAE
YWHAH,38,9,5,Q04917,42,14-3-3 protein eta OS=Homo sapiens OX=9606 GN=...,YWHAH
...,...,...,...,...,...,...,...
ZNF800,9,4,4,Q2TB10,9,Zinc finger protein 800 OS=Homo sapiens OX=960...,ZNF800
ZRANB2,6,4,4,O95218,10,Zinc finger Ran-binding domain-containing prot...,ZRANB2
ZFR,3,2,2,Q96KR1,4,Zinc finger RNA-binding protein OS=Homo sapien...,ZFR
AZGP1,36,4,4,P25311,18,Zinc-alpha-2-glycoprotein OS=Homo sapiens OX=9...,AZGP1


### Gene label map

In [7]:
# MARCS gene label <-> ChIP-MS label mapping (one row per linked pair);
# the first CSV column becomes the row index
data_gene_label_map = pd.read_csv(INPUT_GENE_LABEL_MAP, index_col=0)
data_gene_label_map

Unnamed: 0,marcs_gene_label,chip_ms_label,mapped_via
0,ACIN1 (1),ACIN1,Accession;Gene name
1,ACIN1 (2),ACIN1,Gene name
2,ACIN1 (3),ACIN1,Gene name
3,ACIN1 (4),ACIN1,Gene name
4,ACTB,ACTB,Accession;Gene name
...,...,...,...
568,ZMYND11,ZMYND11,Accession;Gene name
569,ZNF326,ZNF326,Accession;Gene name
570,ZNF512,ZNF512,Accession;Gene name
571,ZNF512B,ZNF512B,Accession;Gene name


Make the ChIP-MS <-> MARCS gene label map centred on the ChIP-MS labels, by concatenating multiple mappings via ';'

In [8]:
# One group per ChIP-MS label, shared by both aggregated columns below
_mappings_by_chip_ms_label = data_gene_label_map.groupby('chip_ms_label')

data_gene_label_map_gene_centric = pd.DataFrame({
    # All distinct MARCS labels linked to the ChIP-MS label, sorted and ';'-joined
    'marcs_gene_label': _mappings_by_chip_ms_label['marcs_gene_label'].apply(
        lambda labels: ';'.join(sorted(labels.unique()))
    ),
    # 'mapped_via' is itself ';'-separated: split it into single routes,
    # deduplicate across the group, then sort and re-join
    'mapped_via': _mappings_by_chip_ms_label['mapped_via'].apply(
        lambda routes: ';'.join(sorted(routes.str.split(';', expand=True).stack().unique()))
    ),
})

data_gene_label_map_gene_centric

Unnamed: 0_level_0,marcs_gene_label,mapped_via
chip_ms_label,Unnamed: 1_level_1,Unnamed: 2_level_1
ACIN1,ACIN1 (1);ACIN1 (2);ACIN1 (3);ACIN1 (4),Accession;Gene name
ACTA2,"ACT[A1,A2,C1,G2]",Accession;Gene name
ACTB,ACTB,Accession;Gene name
ACTBL2,ACTBL2,Accession;Gene name
ACTL6A,ACTL6A,Accession;Gene name
...,...,...
ZMYND11,ZMYND11,Accession;Gene name
ZNF326,ZNF326,Accession;Gene name
ZNF512,ZNF512,Accession;Gene name
ZNF512B,ZNF512B,Accession;Gene name


In [9]:
# Number of ChIP-MS labels per (deduplicated) combination of mapping routes
data_gene_label_map_gene_centric['mapped_via'].value_counts()

Accession;Gene name    512
Accession               10
Gene name                5
Name: mapped_via, dtype: int64

### Unnormalised data

In [10]:
# Unnormalised ChIP-MS values; the first CSV column becomes the row index
data_unnormalised_numeric = pd.read_csv(INPUT_NUMERIC_DATA, index_col=0)

# Name the column axis
data_unnormalised_numeric.columns.name = 'Experiment_Replicate'

data_unnormalised_numeric

Experiment_Replicate,H3_1,H3_2,H3_3,H4_1,H4_2,H4_3,H3K4me3_1,H3K4me3_2,H3K4me3_3,H3K4me1_1,H3K4me1_2,H3K4me1_3
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
GBE1,,,,,6.244600e+04,,,6.454938e+04,,,,
EFTUD2,2.522426e+05,1.730602e+05,1.584524e+05,1.189396e+05,6.677202e+05,4.115486e+04,5.686581e+05,2.978815e+05,5.589060e+05,1.053073e+06,,1.746669e+05
YWHAB,2.472521e+05,3.426881e+05,9.039318e+05,2.274400e+05,5.873763e+05,2.250794e+05,3.055163e+05,4.152821e+05,8.572419e+05,8.089974e+05,4.741039e+05,1.421636e+05
YWHAE,3.443775e+05,1.929841e+05,,3.170019e+05,3.192067e+05,4.202944e+05,2.851882e+05,3.572551e+05,7.482891e+05,4.671955e+05,3.569694e+05,4.508984e+05
YWHAH,1.238786e+08,1.892759e+08,5.117897e+08,6.823328e+07,1.175261e+08,7.435374e+07,8.311959e+07,8.932719e+07,1.318390e+08,5.130819e+07,7.274683e+07,1.709113e+08
...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF800,1.516332e+06,9.484857e+05,7.395190e+05,2.055031e+05,7.753342e+05,5.426501e+05,2.734432e+06,1.524918e+06,3.320131e+06,1.976328e+06,1.619403e+06,2.126770e+06
ZRANB2,1.354622e+05,1.981103e+05,,,2.034418e+05,1.775103e+05,8.948932e+04,1.117538e+05,4.396195e+05,3.247958e+05,,
ZFR,,,,3.751825e+05,3.159557e+05,,,6.455933e+05,1.824670e+06,1.449104e+06,7.430923e+05,6.052774e+05
AZGP1,7.092820e+06,7.777356e+06,1.442498e+07,4.493467e+06,6.012604e+06,5.489928e+06,4.363090e+06,5.296675e+06,8.245945e+06,9.541946e+06,1.197661e+07,1.351196e+07


### Model outputs

Let's read the model outputs:

In [11]:
data_model_outputs = pd.read_csv(INPUT_MODEL_RESULTS, index_col=0)

# The CSV stores a flat header with '__' between the column group and the
# column name; split it back into a two-level column index.
_model_header = [column.split('__') for column in data_model_outputs.columns]
data_model_outputs.columns = pd.MultiIndex.from_tuples(_model_header)

data_model_outputs

Unnamed: 0_level_0,normalised_data,normalised_data,normalised_data,normalised_data,normalised_data,normalised_data,normalised_data,normalised_data,normalised_data,normalised_data,...,coefficient_estimates,H3K4me1vsControl,H3K4me1vsControl,H3K4me1vsControl,H3K4me3vsControl,H3K4me3vsControl,H3K4me3vsControl,H3K4me3vsH3K4me1,H3K4me3vsH3K4me1,H3K4me3vsH3K4me1
Unnamed: 0_level_1,H3_1,H3_2,H3_3,H4_1,H4_2,H4_3,H3K4me3_1,H3K4me3_2,H3K4me3_3,H3K4me1_1,...,ExperimentH3K4me3,logFC_imputed,logFC_is_imputed,logFC_based_on_single_datapoint,logFC_imputed,logFC_is_imputed,logFC_based_on_single_datapoint,logFC_imputed,logFC_is_imputed,logFC_based_on_single_datapoint
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
GBE1,,,,,15.376077,,,17.325027,,,...,,,,,,,,,,
EFTUD2,17.529162,17.503392,16.539555,16.318518,18.794640,14.775579,20.041195,19.531290,19.353650,20.006004,...,19.642045,1.779547,False,False,2.731904,False,False,0.952357,False,False
YWHAB,17.500332,18.489014,19.051719,17.253775,18.609681,17.226879,19.144883,20.010644,19.970746,19.625605,...,19.708758,0.496726,False,False,1.686858,False,False,1.190132,False,False
YWHAE,17.978340,17.660600,,17.732780,17.729887,18.127845,19.045548,19.793507,19.774640,18.833497,...,19.537898,0.831718,False,False,1.696411,False,False,0.864693,False,False
YWHAH,26.469061,27.598393,28.196840,25.482620,26.254161,25.594706,27.232678,27.759508,27.235605,25.612516,...,27.409264,-0.254029,False,False,0.809967,False,False,1.063995,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF800,20.116863,19.957744,18.762092,17.107449,19.010215,18.496467,22.306802,21.887212,21.924212,20.914221,...,22.039409,1.931435,False,False,3.130937,False,False,1.199502,False,False
ZRANB2,16.632241,17.698422,,,17.080012,16.884347,17.373421,18.116876,19.007300,18.309003,...,18.165866,1.235248,False,True,1.092110,False,False,-0.143138,False,True
ZFR,,,,17.975881,17.715118,,,20.647178,21.060608,20.466560,...,20.853893,1.866238,True,False,3.008393,True,False,1.142155,False,False
AZGP1,22.342637,22.993326,23.047930,21.558046,21.965314,21.835160,22.980912,23.683567,23.236657,23.185682,...,23.300378,1.158054,False,False,1.009976,False,False,-0.148077,False,False


In [12]:
# Distinct top-level column groups of the model results, in order of first appearance
full_model_data_columns = data_model_outputs.columns.get_level_values(0).unique()
full_model_data_columns

Index(['normalised_data', 'comment', 'H3K4me1vsControl', 'H3K4me3vsControl',
       'H3K4me3vsH3K4me1', 'coefficient_estimates'],
      dtype='object')

Next, split the model outputs into three parts: the normalised data, the coefficient estimates, and everything else (the per-contrast estimates and the comment).

In [13]:
# The two column groups that are pulled out on their own
data_model_outputs_normalised_data = data_model_outputs['normalised_data']
data_model_outputs_coefficients = data_model_outputs['coefficient_estimates']

# Every remaining top-level group (per-contrast statistics and the comment).
# Index.difference returns the group names sorted.
_remaining_column_groups = full_model_data_columns.difference(
    ['normalised_data', 'coefficient_estimates']
)
data_model_outputs_estimates = data_model_outputs[_remaining_column_groups]

In [14]:
data_model_outputs_normalised_data

Unnamed: 0_level_0,H3_1,H3_2,H3_3,H4_1,H4_2,H4_3,H3K4me3_1,H3K4me3_2,H3K4me3_3,H3K4me1_1,H3K4me1_2,H3K4me1_3
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
GBE1,,,,,15.376077,,,17.325027,,,,
EFTUD2,17.529162,17.503392,16.539555,16.318518,18.794640,14.775579,20.041195,19.531290,19.353650,20.006004,,17.373371
YWHAB,17.500332,18.489014,19.051719,17.253775,18.609681,17.226879,19.144883,20.010644,19.970746,19.625605,18.853954,17.076317
YWHAE,17.978340,17.660600,,17.732780,17.729887,18.127845,19.045548,19.793507,19.774640,18.833497,18.444551,18.741568
YWHAH,26.469061,27.598393,28.196840,25.482620,26.254161,25.594706,27.232678,27.759508,27.235605,25.612516,26.115492,27.307797
...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF800,20.116863,19.957744,18.762092,17.107449,19.010215,18.496467,22.306802,21.887212,21.924212,20.914221,20.626141,20.979358
ZRANB2,16.632241,17.698422,,,17.080012,16.884347,17.373421,18.116876,19.007300,18.309003,,
ZFR,,,,17.975881,17.715118,,,20.647178,21.060608,20.466560,19.502292,19.166362
AZGP1,22.342637,22.993326,23.047930,21.558046,21.965314,21.835160,22.980912,23.683567,23.236657,23.185682,23.512826,23.646859


In [15]:
data_model_outputs_coefficients

Unnamed: 0_level_0,ExperimentH4,ExperimentH3,ExperimentH3K4me1,ExperimentH3K4me3
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GBE1,,,,
EFTUD2,16.629579,17.190703,18.689688,19.642045
YWHAB,17.696778,18.347022,18.518626,19.708758
YWHAE,17.863504,17.819470,18.673205,19.537898
YWHAH,25.777163,27.421432,26.345268,27.409264
...,...,...,...,...
ZNF800,18.204710,19.612233,20.839906,22.039409
ZRANB2,16.982180,17.165331,18.309003,18.165866
ZFR,17.845500,,19.711738,20.853893
AZGP1,21.786173,22.794631,23.448456,23.300378


In [16]:
data_model_outputs_estimates

Unnamed: 0_level_0,H3K4me1vsControl,H3K4me1vsControl,H3K4me1vsControl,H3K4me1vsControl,H3K4me1vsControl,H3K4me1vsControl,H3K4me1vsControl,H3K4me1vsControl,H3K4me1vsControl,H3K4me3vsControl,...,H3K4me3vsH3K4me1,H3K4me3vsH3K4me1,H3K4me3vsH3K4me1,H3K4me3vsH3K4me1,H3K4me3vsH3K4me1,H3K4me3vsH3K4me1,H3K4me3vsH3K4me1,H3K4me3vsH3K4me1,H3K4me3vsH3K4me1,comment
Unnamed: 0_level_1,logFC,t,P.Value,adj.P.Val,B,significant,logFC_imputed,logFC_is_imputed,logFC_based_on_single_datapoint,logFC,...,logFC,t,P.Value,adj.P.Val,B,significant,logFC_imputed,logFC_is_imputed,logFC_based_on_single_datapoint,comment
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
GBE1,,,,,,,,,,,...,,,,,,,,,,Insufficient number of non-null values
EFTUD2,1.779547,1.822335,0.101615,0.178518,-5.525210,False,1.779547,False,False,2.731904,...,0.952357,0.872295,0.405643,0.654289,-6.275122,False,0.952357,False,False,
YWHAB,0.496726,0.858317,0.410759,0.516841,-6.837264,False,0.496726,False,False,1.686858,...,1.190132,1.780971,0.105170,0.300144,-5.290687,False,1.190132,False,False,
YWHAE,0.831718,3.477230,0.006686,0.025468,-2.939244,True,0.831718,False,False,1.696411,...,0.864693,3.258598,0.009512,0.076008,-2.939485,False,0.864693,False,False,
YWHAH,-0.254029,-0.574219,0.578469,0.663142,-7.048612,False,-0.254029,False,False,0.809967,...,1.063995,2.082884,0.063805,0.223445,-4.831719,False,1.063995,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF800,1.931435,4.590035,0.000987,0.008275,-1.045124,True,1.931435,False,False,3.130937,...,1.199502,2.468696,0.033113,0.150813,-4.207841,False,1.199502,False,False,
ZRANB2,1.235248,1.802913,0.121215,0.203563,-5.280518,False,1.235248,False,True,1.092110,...,-0.143138,-0.202284,0.846347,0.922736,-6.420724,False,-0.143138,False,True,logFC(H3K4me1vsControl) estimation was based o...
ZFR,,,,,,False,1.866238,True,False,,...,1.142155,2.625751,0.039099,0.168944,-4.056730,False,1.142155,False,False,logFC(H3K4me1vsControl) estimation failed and ...
AZGP1,1.158054,4.921075,0.000562,0.006275,-0.473142,True,1.158054,False,False,1.009976,...,-0.148077,-0.544942,0.597451,0.800547,-6.625636,False,-0.148077,False,False,


## Collecting the data for output

### ChIP-MS Data

It always makes sense to start with metadata

In [17]:
# Columns that lead the metadata block, in this order
_first_columns = ['Label', 'Accession', 'Gene', 'marcs_gene_label', 'mapped_via', 'Description']

excel_metadata = data_metadata_chip_ms.copy()

# Copy the label into a regular column so we don't need to reset the index
excel_metadata['Label'] = excel_metadata.index

# Also add the MARCS gene label info (left join on the index, so labels
# without a MARCS counterpart keep empty values)
excel_metadata = excel_metadata.join(data_gene_label_map_gene_centric)

# Leading columns first, followed by all remaining columns in sorted order
_other_columns = list(excel_metadata.columns.difference(_first_columns))
excel_metadata = excel_metadata[_first_columns + _other_columns]

# Give the MARCS columns spreadsheet-friendly names
excel_metadata = excel_metadata.rename(columns={
    'marcs_gene_label': 'MARCS Gene label(s)',
    'mapped_via': 'MARCS Gene label(s) linked via',
})

# Add a 'metadata' group header above every column
_metadata_header = [('metadata', column) for column in excel_metadata.columns]
excel_metadata.columns = pd.MultiIndex.from_tuples(_metadata_header)

excel_metadata

Unnamed: 0_level_0,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata
Unnamed: 0_level_1,Label,Accession,Gene,MARCS Gene label(s),MARCS Gene label(s) linked via,Description,# PSMs,# Peptides,# Unique Peptides,Coverage [%]
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
GBE1,GBE1,Q04446,GBE1,,,"1,4-alpha-glucan-branching enzyme OS=Homo sapi...",2,2,2,3
EFTUD2,EFTUD2,Q15029,EFTUD2,EFTUD2,Accession;Gene name,116 kDa U5 small nuclear ribonucleoprotein com...,44,15,14,23
YWHAB,YWHAB,P31946,YWHAB,,,14-3-3 protein beta/alpha OS=Homo sapiens OX=9...,44,9,4,43
YWHAE,YWHAE,P62258,YWHAE,,,14-3-3 protein epsilon OS=Homo sapiens OX=9606...,53,11,8,49
YWHAH,YWHAH,Q04917,YWHAH,,,14-3-3 protein eta OS=Homo sapiens OX=9606 GN=...,38,9,5,42
...,...,...,...,...,...,...,...,...,...,...
ZNF800,ZNF800,Q2TB10,ZNF800,,,Zinc finger protein 800 OS=Homo sapiens OX=960...,9,4,4,9
ZRANB2,ZRANB2,O95218,ZRANB2,ZRANB2,Accession;Gene name,Zinc finger Ran-binding domain-containing prot...,6,4,4,10
ZFR,ZFR,Q96KR1,ZFR,ZFR,Accession;Gene name,Zinc finger RNA-binding protein OS=Homo sapien...,3,2,2,4
AZGP1,AZGP1,P25311,AZGP1,,,Zinc-alpha-2-glycoprotein OS=Homo sapiens OX=9...,36,4,4,18


#### Raw Data

Now let's shape the raw data into something more useful too

In [18]:
# Put every replicate column under a 'raw_data' group header
_raw_data_header = [('raw_data', replicate) for replicate in data_unnormalised_numeric.columns]

excel_raw_data = data_unnormalised_numeric.copy()
excel_raw_data.columns = pd.MultiIndex.from_tuples(_raw_data_header)

excel_raw_data

Unnamed: 0_level_0,raw_data,raw_data,raw_data,raw_data,raw_data,raw_data,raw_data,raw_data,raw_data,raw_data,raw_data,raw_data
Unnamed: 0_level_1,H3_1,H3_2,H3_3,H4_1,H4_2,H4_3,H3K4me3_1,H3K4me3_2,H3K4me3_3,H3K4me1_1,H3K4me1_2,H3K4me1_3
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
GBE1,,,,,6.244600e+04,,,6.454938e+04,,,,
EFTUD2,2.522426e+05,1.730602e+05,1.584524e+05,1.189396e+05,6.677202e+05,4.115486e+04,5.686581e+05,2.978815e+05,5.589060e+05,1.053073e+06,,1.746669e+05
YWHAB,2.472521e+05,3.426881e+05,9.039318e+05,2.274400e+05,5.873763e+05,2.250794e+05,3.055163e+05,4.152821e+05,8.572419e+05,8.089974e+05,4.741039e+05,1.421636e+05
YWHAE,3.443775e+05,1.929841e+05,,3.170019e+05,3.192067e+05,4.202944e+05,2.851882e+05,3.572551e+05,7.482891e+05,4.671955e+05,3.569694e+05,4.508984e+05
YWHAH,1.238786e+08,1.892759e+08,5.117897e+08,6.823328e+07,1.175261e+08,7.435374e+07,8.311959e+07,8.932719e+07,1.318390e+08,5.130819e+07,7.274683e+07,1.709113e+08
...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF800,1.516332e+06,9.484857e+05,7.395190e+05,2.055031e+05,7.753342e+05,5.426501e+05,2.734432e+06,1.524918e+06,3.320131e+06,1.976328e+06,1.619403e+06,2.126770e+06
ZRANB2,1.354622e+05,1.981103e+05,,,2.034418e+05,1.775103e+05,8.948932e+04,1.117538e+05,4.396195e+05,3.247958e+05,,
ZFR,,,,3.751825e+05,3.159557e+05,,,6.455933e+05,1.824670e+06,1.449104e+06,7.430923e+05,6.052774e+05
AZGP1,7.092820e+06,7.777356e+06,1.442498e+07,4.493467e+06,6.012604e+06,5.489928e+06,4.363090e+06,5.296675e+06,8.245945e+06,9.541946e+06,1.197661e+07,1.351196e+07


#### Normalised Data

Likewise for normalised data

In [19]:
# Put every replicate column under a 'normalised_data_log2' group header
_normalised_data_header = [
    ('normalised_data_log2', replicate)
    for replicate in data_model_outputs_normalised_data.columns
]

excel_normalised_data = data_model_outputs_normalised_data.copy()
excel_normalised_data.columns = pd.MultiIndex.from_tuples(_normalised_data_header)

excel_normalised_data

Unnamed: 0_level_0,normalised_data_log2,normalised_data_log2,normalised_data_log2,normalised_data_log2,normalised_data_log2,normalised_data_log2,normalised_data_log2,normalised_data_log2,normalised_data_log2,normalised_data_log2,normalised_data_log2,normalised_data_log2
Unnamed: 0_level_1,H3_1,H3_2,H3_3,H4_1,H4_2,H4_3,H3K4me3_1,H3K4me3_2,H3K4me3_3,H3K4me1_1,H3K4me1_2,H3K4me1_3
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
GBE1,,,,,15.376077,,,17.325027,,,,
EFTUD2,17.529162,17.503392,16.539555,16.318518,18.794640,14.775579,20.041195,19.531290,19.353650,20.006004,,17.373371
YWHAB,17.500332,18.489014,19.051719,17.253775,18.609681,17.226879,19.144883,20.010644,19.970746,19.625605,18.853954,17.076317
YWHAE,17.978340,17.660600,,17.732780,17.729887,18.127845,19.045548,19.793507,19.774640,18.833497,18.444551,18.741568
YWHAH,26.469061,27.598393,28.196840,25.482620,26.254161,25.594706,27.232678,27.759508,27.235605,25.612516,26.115492,27.307797
...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF800,20.116863,19.957744,18.762092,17.107449,19.010215,18.496467,22.306802,21.887212,21.924212,20.914221,20.626141,20.979358
ZRANB2,16.632241,17.698422,,,17.080012,16.884347,17.373421,18.116876,19.007300,18.309003,,
ZFR,,,,17.975881,17.715118,,,20.647178,21.060608,20.466560,19.502292,19.166362
AZGP1,22.342637,22.993326,23.047930,21.558046,21.965314,21.835160,22.980912,23.683567,23.236657,23.185682,23.512826,23.646859


### MARCS Data

From MARCS data we really only need the MARCS feature classifications.

In [20]:
# Preview the two classification column groups (strong and weak significance
# categories) across all MARCS features
data_marcs_features[[
    'significant_category_strong',
    'significant_category_weak'
]]

marcs_stat,significant_category_strong,significant_category_strong,significant_category_strong,significant_category_strong,significant_category_strong,significant_category_strong,significant_category_strong,significant_category_strong,significant_category_strong,significant_category_strong,...,significant_category_weak,significant_category_weak,significant_category_weak,significant_category_weak,significant_category_weak,significant_category_weak,significant_category_weak,significant_category_weak,significant_category_weak,significant_category_weak
marcs_feature,H2A.Z,H3K27ac,H3K27me2,H3K27me3,H3K4me1,H3K4me3,H3K9acK14ac,H3K9me2,H3K9me3,H3ac,...,H3K4me3,H3K9acK14ac,H3K9me2,H3K9me3,H3ac,H4K16ac,H4K20me2,H4K20me3,H4ac,meDNA
chip_ms_label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
ACIN1,Neither,Neither,Neither,Neither,Neither,Neither,Neither,Neither,Neither,Neither,...,Neither,Neither,Neither,Neither,Excluded,Neither,Excluded,Neither,Neither,Neither
ACTA2,Neither,Neither,Neither,Neither,Neither,Neither,Strongly recruited,Neither,Neither,Strongly recruited,...,Neither,Recruited,Neither,Neither,Recruited,Neither,Recruited,Neither,Recruited,Excluded
ACTB,Neither,Neither,Neither,Neither,Neither,Neither,Strongly recruited,Neither,Neither,Strongly recruited,...,Neither,Recruited,Neither,Neither,Recruited,Neither,Recruited,Neither,Recruited,Excluded
ACTL6A,Neither,Strongly recruited,Neither,Neither,Neither,Neither,Strongly recruited,Neither,Neither,Strongly recruited,...,Neither,Recruited,Neither,Neither,Recruited,Neither,Neither,Neither,Recruited,Excluded
ADAR,Neither,Neither,Neither,Neither,Neither,Neither,Neither,Neither,Neither,Neither,...,Neither,Excluded,Neither,Neither,Excluded,Neither,Excluded,Neither,Neither,Neither
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SSB,,,,,,,,,Neither,,...,,,,Neither,,,,,,
TIA1,,,,,,,,,Neither,,...,,,,Neither,,,,,,
TIAL1,,,,,,,,,Neither,,...,,,,Neither,,,,,,
XPO1,,,,,,,,,Neither,,...,,,,Neither,,,Neither,Neither,,


For excel we will recode these into shorter counterparts:

- R/E - Recruited/Excluded
- SR/SE - Strongly Recruited/Strongly Excluded
- N - Neither
- (empty) - No Data


In [21]:
def categorise_features_for_excel(row):
    """
    Collapse the strong/weak significance categories of one row into a short code.

    Parameters
    ----------
    row : pandas.Series
        Must contain the entries 'significant_category_strong' and
        'significant_category_weak'.

    Returns
    -------
    str or None
        'SR' / 'SE' when the strong category is 'Strongly recruited' /
        'Strongly excluded'. Otherwise 'R', 'E' or 'N' for a weak category of
        'Recruited', 'Excluded' or 'Neither'. None when both categories are
        null (no data).

    Raises
    ------
    AssertionError
        If exactly one of the two categories is null.
    KeyError
        If a non-null category holds a label other than the ones listed above.
    """
    missing = row[['significant_category_weak', 'significant_category_strong']].isnull()
    if missing.any():
        # One being null should imply both null
        assert missing.all()
        return None

    # The strong call takes precedence over the weak one
    strong_category = row['significant_category_strong']
    if strong_category != 'Neither':
        return {'Strongly recruited': 'SR', 'Strongly excluded': 'SE'}[strong_category]

    return {'Recruited': 'R', 'Excluded': 'E', 'Neither': 'N'}[row['significant_category_weak']]
    

# Bring the feature name to the outer column level, so that selecting one
# feature yields a two-column (strong, weak) frame.
_df = data_marcs_features[
    ['significant_category_strong', 'significant_category_weak']
].swaplevel(axis='columns')

# One recoded column per MARCS feature, in the MARCS_FEATURE_ORDER order
excel_marcs_features = pd.DataFrame({
    marcs_feature: _df[marcs_feature].apply(categorise_features_for_excel, axis=1)
    for marcs_feature in MARCS_FEATURE_ORDER
})

# Add a 'marcs_feature_effects' group header above the feature names
_marcs_effects_header = [('marcs_feature_effects', feature) for feature in excel_marcs_features.columns]
excel_marcs_features.columns = pd.MultiIndex.from_tuples(_marcs_effects_header)

### Model outputs

Finally, the model outputs. We will split the outputs as follows:

1. We will take the logFC_imputed columns and place them separately (as a summary)
2. We will also separate the comment
3. And the coefficient estimates
4. We will keep remaining outputs pretty much as is (separately)

In [22]:
# Distinct second-level column names of the estimates frame
# (the per-contrast statistics, plus 'comment')
full_model_output_estimates_columns = data_model_outputs_estimates.columns.get_level_values(1).unique()
full_model_output_estimates_columns

Index(['logFC', 't', 'P.Value', 'adj.P.Val', 'B', 'significant',
       'logFC_imputed', 'logFC_is_imputed', 'logFC_based_on_single_datapoint',
       'comment'],
      dtype='object')

In [23]:
# The comment column, kept as a one-column frame
_comment_column = ('comment', 'comment')
excel_model_comment = data_model_outputs_estimates[[_comment_column]].copy()

# File it under the 'model_estimates' group header
excel_model_comment.columns = pd.MultiIndex.from_tuples(
    [('model_estimates', 'comment')]
)

excel_model_comment

Unnamed: 0_level_0,model_estimates
Unnamed: 0_level_1,comment
Label,Unnamed: 1_level_2
GBE1,Insufficient number of non-null values
EFTUD2,
YWHAB,
YWHAE,
YWHAH,
...,...
ZNF800,
ZRANB2,logFC(H3K4me1vsControl) estimation was based o...
ZFR,logFC(H3K4me1vsControl) estimation failed and ...
AZGP1,


In [24]:
# Put every coefficient column under a 'model_coefficient_estimates' group header
_coefficients_header = [
    ('model_coefficient_estimates', coefficient)
    for coefficient in data_model_outputs_coefficients.columns
]

excel_model_coefficients = data_model_outputs_coefficients.copy()
excel_model_coefficients.columns = pd.MultiIndex.from_tuples(_coefficients_header)

excel_model_coefficients

Unnamed: 0_level_0,model_coefficient_estimates,model_coefficient_estimates,model_coefficient_estimates,model_coefficient_estimates
Unnamed: 0_level_1,ExperimentH4,ExperimentH3,ExperimentH3K4me1,ExperimentH3K4me3
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
GBE1,,,,
EFTUD2,16.629579,17.190703,18.689688,19.642045
YWHAB,17.696778,18.347022,18.518626,19.708758
YWHAE,17.863504,17.819470,18.673205,19.537898
YWHAH,25.777163,27.421432,26.345268,27.409264
...,...,...,...,...
ZNF800,18.204710,19.612233,20.839906,22.039409
ZRANB2,16.982180,17.165331,18.309003,18.165866
ZFR,17.845500,,19.711738,20.853893
AZGP1,21.786173,22.794631,23.448456,23.300378


In [25]:
# Summary block: only the 'logFC_imputed' column of each contrast in MODEL_COEFS.
# .copy() detaches the selection before it is relabelled, in line with how the
# other excel_* frames in this notebook are built.
excel_model_outputs_fc_estimates = data_model_outputs_estimates.loc(axis=1)[MODEL_COEFS, 'logFC_imputed'].copy()

# Prefix the contrast name so that each group header is self-describing
excel_model_outputs_fc_estimates.columns = pd.MultiIndex.from_tuples([
    ('model_estimates_for_{}'.format(contrast), stat)
    for contrast, stat in excel_model_outputs_fc_estimates.columns
])

excel_model_outputs_fc_estimates

Unnamed: 0_level_0,model_estimates_for_H3K4me1vsControl,model_estimates_for_H3K4me3vsControl,model_estimates_for_H3K4me3vsH3K4me1
Unnamed: 0_level_1,logFC_imputed,logFC_imputed,logFC_imputed
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
GBE1,,,
EFTUD2,1.779547,2.731904,0.952357
YWHAB,0.496726,1.686858,1.190132
YWHAE,0.831718,1.696411,0.864693
YWHAH,-0.254029,0.809967,1.063995
...,...,...,...
ZNF800,1.931435,3.130937,1.199502
ZRANB2,1.235248,1.092110,-0.143138
ZFR,1.866238,3.008393,1.142155
AZGP1,1.158054,1.009976,-0.148077


In [26]:
# All per-contrast statistics for the contrasts in MODEL_COEFS ...
_per_contrast_estimates = data_model_outputs_estimates[MODEL_COEFS]

# ... except 'logFC_imputed', which is exported in the summary block instead
_kept_columns = [
    column for column in _per_contrast_estimates.columns
    if column[1] != 'logFC_imputed'
]
excel_model_outputs_other = _per_contrast_estimates[_kept_columns].copy()

# Prefix the contrast name so that each group header is self-describing
excel_model_outputs_other.columns = pd.MultiIndex.from_tuples([
    ('model_estimates_for_{}'.format(contrast), stat)
    for contrast, stat in excel_model_outputs_other.columns
])

excel_model_outputs_other

Unnamed: 0_level_0,model_estimates_for_H3K4me1vsControl,model_estimates_for_H3K4me1vsControl,model_estimates_for_H3K4me1vsControl,model_estimates_for_H3K4me1vsControl,model_estimates_for_H3K4me1vsControl,model_estimates_for_H3K4me1vsControl,model_estimates_for_H3K4me1vsControl,model_estimates_for_H3K4me1vsControl,model_estimates_for_H3K4me3vsControl,model_estimates_for_H3K4me3vsControl,model_estimates_for_H3K4me3vsControl,model_estimates_for_H3K4me3vsControl,model_estimates_for_H3K4me3vsControl,model_estimates_for_H3K4me3vsH3K4me1,model_estimates_for_H3K4me3vsH3K4me1,model_estimates_for_H3K4me3vsH3K4me1,model_estimates_for_H3K4me3vsH3K4me1,model_estimates_for_H3K4me3vsH3K4me1,model_estimates_for_H3K4me3vsH3K4me1,model_estimates_for_H3K4me3vsH3K4me1,model_estimates_for_H3K4me3vsH3K4me1
Unnamed: 0_level_1,logFC,t,P.Value,adj.P.Val,B,significant,logFC_is_imputed,logFC_based_on_single_datapoint,logFC,t,...,logFC_is_imputed,logFC_based_on_single_datapoint,logFC,t,P.Value,adj.P.Val,B,significant,logFC_is_imputed,logFC_based_on_single_datapoint
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
GBE1,,,,,,,,,,,...,,,,,,,,,,
EFTUD2,1.779547,1.822335,0.101615,0.178518,-5.525210,False,False,False,2.731904,3.230379,...,False,False,0.952357,0.872295,0.405643,0.654289,-6.275122,False,False,False
YWHAB,0.496726,0.858317,0.410759,0.516841,-6.837264,False,False,False,1.686858,2.914806,...,False,False,1.190132,1.780971,0.105170,0.300144,-5.290687,False,False,False
YWHAE,0.831718,3.477230,0.006686,0.025468,-2.939244,True,False,False,1.696411,7.092319,...,False,False,0.864693,3.258598,0.009512,0.076008,-2.939485,False,False,False
YWHAH,-0.254029,-0.574219,0.578469,0.663142,-7.048612,False,False,False,0.809967,1.830888,...,False,False,1.063995,2.082884,0.063805,0.223445,-4.831719,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF800,1.931435,4.590035,0.000987,0.008275,-1.045124,True,False,False,3.130937,7.440640,...,False,False,1.199502,2.468696,0.033113,0.150813,-4.207841,False,False,False
ZRANB2,1.235248,1.802913,0.121215,0.203563,-5.280518,False,False,True,1.092110,2.333371,...,False,False,-0.143138,-0.202284,0.846347,0.922736,-6.420724,False,False,True
ZFR,,,,,,False,True,False,,,...,True,False,1.142155,2.625751,0.039099,0.168944,-4.056730,False,False,False
AZGP1,1.158054,4.921075,0.000562,0.006275,-0.473142,True,False,False,1.009976,4.291830,...,False,False,-0.148077,-0.544942,0.597451,0.800547,-6.625636,False,False,False


## Writing excel

In [27]:
import xlsxwriter
from seaborn.utils import relative_luminance

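Collect all column blocks (metadata, fold-change estimates, comments, MARCS features, remaining model outputs, coefficients, normalised and raw data) into a single DataFrame, joined on the protein label index.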

In [28]:
# Assemble the export table: start from the metadata block and attach every
# other column block on the shared row index, in the left-to-right order the
# columns should appear in the spreadsheet.
df_excel = (
    excel_metadata
    .join(excel_model_outputs_fc_estimates)
    .join(excel_model_comment)
    .join(excel_marcs_features)
    .join(excel_model_outputs_other)
    .join(excel_model_coefficients)
    .join(excel_normalised_data)
    .join(excel_raw_data)
)

# Each row label must appear exactly once after joining.
assert df_excel.index.is_unique
df_excel

Unnamed: 0_level_0,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,...,raw_data,raw_data,raw_data,raw_data,raw_data,raw_data,raw_data,raw_data,raw_data,raw_data
Unnamed: 0_level_1,Label,Accession,Gene,MARCS Gene label(s),MARCS Gene label(s) linked via,Description,# PSMs,# Peptides,# Unique Peptides,Coverage [%],...,H3_3,H4_1,H4_2,H4_3,H3K4me3_1,H3K4me3_2,H3K4me3_3,H3K4me1_1,H3K4me1_2,H3K4me1_3
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
GBE1,GBE1,Q04446,GBE1,,,"1,4-alpha-glucan-branching enzyme OS=Homo sapi...",2,2,2,3,...,,,6.244600e+04,,,6.454938e+04,,,,
EFTUD2,EFTUD2,Q15029,EFTUD2,EFTUD2,Accession;Gene name,116 kDa U5 small nuclear ribonucleoprotein com...,44,15,14,23,...,1.584524e+05,1.189396e+05,6.677202e+05,4.115486e+04,5.686581e+05,2.978815e+05,5.589060e+05,1.053073e+06,,1.746669e+05
YWHAB,YWHAB,P31946,YWHAB,,,14-3-3 protein beta/alpha OS=Homo sapiens OX=9...,44,9,4,43,...,9.039318e+05,2.274400e+05,5.873763e+05,2.250794e+05,3.055163e+05,4.152821e+05,8.572419e+05,8.089974e+05,4.741039e+05,1.421636e+05
YWHAE,YWHAE,P62258,YWHAE,,,14-3-3 protein epsilon OS=Homo sapiens OX=9606...,53,11,8,49,...,,3.170019e+05,3.192067e+05,4.202944e+05,2.851882e+05,3.572551e+05,7.482891e+05,4.671955e+05,3.569694e+05,4.508984e+05
YWHAH,YWHAH,Q04917,YWHAH,,,14-3-3 protein eta OS=Homo sapiens OX=9606 GN=...,38,9,5,42,...,5.117897e+08,6.823328e+07,1.175261e+08,7.435374e+07,8.311959e+07,8.932719e+07,1.318390e+08,5.130819e+07,7.274683e+07,1.709113e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF800,ZNF800,Q2TB10,ZNF800,,,Zinc finger protein 800 OS=Homo sapiens OX=960...,9,4,4,9,...,7.395190e+05,2.055031e+05,7.753342e+05,5.426501e+05,2.734432e+06,1.524918e+06,3.320131e+06,1.976328e+06,1.619403e+06,2.126770e+06
ZRANB2,ZRANB2,O95218,ZRANB2,ZRANB2,Accession;Gene name,Zinc finger Ran-binding domain-containing prot...,6,4,4,10,...,,,2.034418e+05,1.775103e+05,8.948932e+04,1.117538e+05,4.396195e+05,3.247958e+05,,
ZFR,ZFR,Q96KR1,ZFR,ZFR,Accession;Gene name,Zinc finger RNA-binding protein OS=Homo sapien...,3,2,2,4,...,,3.751825e+05,3.159557e+05,,,6.455933e+05,1.824670e+06,1.449104e+06,7.430923e+05,6.052774e+05
AZGP1,AZGP1,P25311,AZGP1,,,Zinc-alpha-2-glycoprotein OS=Homo sapiens OX=9...,36,4,4,18,...,1.442498e+07,4.493467e+06,6.012604e+06,5.489928e+06,4.363090e+06,5.296675e+06,8.245945e+06,9.541946e+06,1.197661e+07,1.351196e+07


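Sort rows by the (imputed) log2 fold change of H3K4me3 vs H3K4me1, breaking ties by the H3K4me3 vs Control and then the H3K4me1 vs Control estimates; all keys are sorted in descending order.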

In [29]:
# Sort keys in priority order: the me3-vs-me1 contrast first, then the two
# contrasts against the control. All keys use the imputed log fold change.
_sort_keys = [
    ('model_estimates_for_H3K4me3vsH3K4me1', 'logFC_imputed'),
    ('model_estimates_for_H3K4me3vsControl', 'logFC_imputed'),
    ('model_estimates_for_H3K4me1vsControl', 'logFC_imputed'),
]

# Largest values end up at the top of the table.
df_excel = df_excel.sort_values(by=_sort_keys, ascending=False)

In [30]:
# Write `df_excel` to a formatted Excel workbook (one "Summary" sheet) and to a
# flat gzipped CSV. The sheet has two hand-written header rows: a top row of
# merged group titles and a second row of rotated column names; the data starts
# on the third row.
output_xlsx = OUTPUT_DIRECTORY / '01-model-results.xlsx'
output_csv = OUTPUT_DIRECTORY / '01-model-results.csv.gz'

# Group title (top header row) -> columns it spans, in sheet order.
COLUMN_GROUPS = {
    'Metadata': excel_metadata.columns,
    'Log2(FC) estimates (incl. imputed)': list(excel_model_outputs_fc_estimates.columns),
    'Comment': list(excel_model_comment.columns),
    'MARCS Feature Response (SR/SE: Strongly recruited/excluded, R/E: Recruited/Excluded, N: neither)': list(excel_marcs_features.columns),
}

# Optional number of header rows a group title should span (default 1).
GROUP_DEPTHS = {
    # 'Comment': 2, # Merging the two rows for the comment group makes excel file not sortable
}

# One "Model outputs" group per model coefficient.
for coef in MODEL_COEFS:
    COLUMN_GROUPS[f'Model outputs for {coef}'] = list(excel_model_outputs_other[[f'model_estimates_for_{coef}']].columns)

COLUMN_GROUPS = {
    **COLUMN_GROUPS,
    'Model coefficient estimates': list(excel_model_coefficients),
    'Normalised data (log2)': list(excel_normalised_data),
    'Raw data': list(excel_raw_data),
}

# Column tuple -> text shown in the second header row.
RENAMES = {}

# Fold-change estimate columns are labelled by the contrast name, i.e. the part
# of the first column level that follows 'model_estimates_for_'.
for col in excel_model_outputs_fc_estimates.columns:
    RENAMES[col] = col[0].partition('model_estimates_for_')[2]

# All remaining columns are labelled by their second column level.
_simple_renames = []
_simple_renames.extend(excel_metadata.columns)
_simple_renames.extend(excel_marcs_features.columns)
_simple_renames.extend(excel_model_outputs_other.columns)
_simple_renames.extend(excel_model_coefficients.columns)
_simple_renames.extend(excel_normalised_data.columns)
_simple_renames.extend(excel_raw_data.columns)
_simple_renames.extend(excel_model_comment.columns)

for col in _simple_renames:
    RENAMES[col] = col[1]


writer = pd.ExcelWriter(output_xlsx, engine='xlsxwriter')
workbook = writer.book

# Reusable cell formats.
bold = workbook.add_format({'bold': True})
bold_right = workbook.add_format({'bold': True, 'right': 1})

bold_rotated = workbook.add_format({'bold': True, 'rotation':90})
bold_rotated_right = workbook.add_format({'bold': True, 'rotation':90, 'right': 1})

merged_format = workbook.add_format({
    'bold': 1,
    'align': 'center',
    'valign': 'vcenter',
    'right': 1,
    'text_wrap': True,
})

right_border = workbook.add_format({
    'right': 1,
})

sheet_name = "Summary"

# Rows 0 and 1 are reserved for the two header rows written below.
first_data_row = 2
first_data_col = 0

# pandas cannot write MultiIndex columns to Excel with header=False, so flatten
# the column labels on a copy of the frame. The flattened names only show up in
# the CSV; the Excel header rows are written by hand further down.
_df_excel_nomultiindex = df_excel.copy()
_df_excel_nomultiindex.columns = ['__'.join(map(str, c)) for c in df_excel.columns]
# Write this df as csv:
_df_excel_nomultiindex.to_csv(output_csv)
# And into excel
_df_excel_nomultiindex.to_excel(
    writer,
    sheet_name=sheet_name,
    startrow=first_data_row,
    startcol=first_data_col,
    index=False,
    header=False
)

# Zero-based, inclusive coordinates of the last data cell. The worksheet range
# arguments used below are inclusive, hence the "- 1" (the max() keeps the
# range valid if the frame is empty).
last_data_row = first_data_row + max(len(df_excel) - 1, 0)
last_data_col = first_data_col + max(len(df_excel.columns) - 1, 0)

worksheet = writer.sheets[sheet_name]

# The last column of every group gets a right border as a visual separator.
SEPARATOR_COLUMNS = {v[-1] for v in COLUMN_GROUPS.values()}

# Second header row: one rotated label per column. Also record where each
# column landed in the sheet.
colname_to_index_map = {}
for i, col in enumerate(df_excel.columns, start=first_data_col):
    fmt_ = bold_rotated_right if col in SEPARATOR_COLUMNS else bold_rotated

    worksheet.write(first_data_row-1, i, RENAMES.get(col, str(col)), fmt_)
    colname_to_index_map[col] = i

# Top header row: one (merged) title cell per column group.
for merged_name, col_list in COLUMN_GROUPS.items():
    _first = colname_to_index_map[col_list[0]]
    _last = colname_to_index_map[col_list[-1]]

    rows_to_merge = GROUP_DEPTHS.get(merged_name, 1)

    if _first == _last and rows_to_merge == 1:
        # Cannot merge one column only
        worksheet.write(first_data_row-2, _first, merged_name, merged_format)
    else:
        worksheet.merge_range(
            first_data_row-2, _first,
            first_data_row-2+(rows_to_merge-1), _last,
            merged_name,
            merged_format
        )

for col in SEPARATOR_COLUMNS:
    worksheet.set_column(colname_to_index_map[col], colname_to_index_map[col], cell_format=right_border)

color_red = '#b2182b'
format_red = workbook.add_format({'bg_color': color_red})
color_red_lighter = '#f4a582'

color_white = '#f7f7f7'

color_blue = '#2166ac'
format_blue = workbook.add_format({'bg_color': color_blue})
color_blue_lighter = '#92c5de'

# MARCS feature columns: colour each cell by its response category.
for (val, color) in [('SR', color_red), ('SE', color_blue), ('R', color_red_lighter), ('E', color_blue_lighter)]:
    # The format depends only on the category, so build it once per category.
    # Black text on light backgrounds, white text on dark ones.
    fmt_ = workbook.add_format({
        'bg_color': color,
        'font_color': "#000000" if relative_luminance(color) > .408 else "#FFFFFF"
    })

    for col in COLUMN_GROUPS['MARCS Feature Response (SR/SE: Strongly recruited/excluded, R/E: Recruited/Excluded, N: neither)']:
        worksheet.conditional_format(
            first_data_row, colname_to_index_map[col],
            last_data_row, colname_to_index_map[col],
            {
                'type': 'cell',
                'criteria': 'equal to',
                'value': f'"{val}"',
                'format': fmt_,
            }
        )


# Log fold-change columns: diverging blue-white-red colour scale.
three_color_columns = list(COLUMN_GROUPS['Log2(FC) estimates (incl. imputed)'])
three_color_columns += [c for c in excel_model_outputs_other.columns if c[1] == 'logFC']


for col in three_color_columns:

    # Symmetric colour limits at the 98th percentile of the finite absolute
    # values (infinite entries are excluded from the quantile).
    _limit = df_excel[col].abs().replace(np.inf, np.nan).quantile(0.98)
    print(f"Excel formatting limit for {col}: +/-{_limit}")

    worksheet.conditional_format(
        first_data_row, colname_to_index_map[col],
        last_data_row, colname_to_index_map[col],
        {
            'type': '3_color_scale',
            'min_type': 'num',
            'max_type': 'num',
            'mid_type': 'num',
            'min_value': -_limit,
            'max_value': _limit,
            'mid_value': 0,
            'max_color': color_red,
            'mid_color': color_white,
            'min_color': color_blue,

        }
    )

    # Infinite values are matched as the text "inf" / "-inf" (pandas' default
    # `inf_rep` when writing to Excel) and get a solid fill instead.
    worksheet.conditional_format(
        first_data_row, colname_to_index_map[col],
        last_data_row, colname_to_index_map[col],
        {'type': 'cell',
        'criteria': '==',
        'value': '"inf"',
        'format': format_red}
    )

    worksheet.conditional_format(
        first_data_row, colname_to_index_map[col],
        last_data_row, colname_to_index_map[col],
        {'type': 'cell',
        'criteria': '==',
        'value': '"-inf"',
        'format': format_blue}
    )

# Coefficient / intensity columns: in-cell data bars between the 1st and 99th percentile.
databar_columns = list(COLUMN_GROUPS['Model coefficient estimates']) + list(excel_normalised_data.columns) + list(excel_raw_data.columns)

for col in databar_columns:

    worksheet.conditional_format(
        first_data_row, colname_to_index_map[col],
        last_data_row, colname_to_index_map[col],
        {
            'type': 'data_bar',
            'min_type': 'percentile',
            'max_type': 'percentile',
            'min_value': 1,
            'max_value': 99,

        }
    )


# Boolean flag columns: TRUE is bold green for 'significant' and bold red for
# the two caveat flags.
boolean_columns_good = [c for c in excel_model_outputs_other.columns if c[1] in ['significant']]
boolean_columns_bad = [c for c in excel_model_outputs_other.columns if c[1] in ['logFC_is_imputed', 'logFC_based_on_single_datapoint']]

bold_green = workbook.add_format({'bold': True, 'font_color': '#1b7837'})
bold_red = workbook.add_format({'bold': True, 'font_color': color_red})
for col in boolean_columns_good:
    worksheet.conditional_format(
        first_data_row, colname_to_index_map[col],
        last_data_row, colname_to_index_map[col],
        {'type': 'cell',
        'criteria': '==',
        'value': 'TRUE',
        'format': bold_green}
    )

for col in boolean_columns_bad:
    worksheet.conditional_format(
        first_data_row, colname_to_index_map[col],
        last_data_row, colname_to_index_map[col],
        {'type': 'cell',
        'criteria': '==',
        'value': 'TRUE',
        'format': bold_red}
    )

# Column widths: narrow columns for short codes/counts, wide ones for free text.
narrow_columns = list(excel_marcs_features.columns)
narrow_columns.extend([c for c in data_metadata_chip_ms if c[1] in ['# PSMs', '# Peptides', '# Unique Peptides', 'Coverage [%]']])

column_widths = {col: 4 for col in narrow_columns}
column_widths[('metadata', 'Description')] = 40
column_widths[('model_estimates', 'comment')] = 40

for col, width in column_widths.items():
    # set_column() replaces the settings stored for a column, so pass the
    # separator border again here; otherwise a width-only call would drop the
    # border that was assigned to group-closing columns above.
    worksheet.set_column(
        colname_to_index_map[col], colname_to_index_map[col],
        width,
        right_border if col in SEPARATOR_COLUMNS else None,
    )

# Keep both header rows and the leading identifier columns visible while scrolling.
worksheet.freeze_panes(first_data_row, colname_to_index_map[('metadata', 'MARCS Gene label(s)')] + 1)
# Filter dropdowns on the column-name row, covering exactly the data range.
worksheet.autofilter(first_data_row-1, first_data_col, last_data_row, last_data_col)

# ExcelWriter.save() was removed in pandas 2.0; close() writes the file and
# releases the handle on old and new pandas versions alike.
writer.close()
print("Done!")

Excel formatting limit for ('model_estimates_for_H3K4me1vsControl', 'logFC_imputed'): +/-2.798055602273705
Excel formatting limit for ('model_estimates_for_H3K4me3vsControl', 'logFC_imputed'): +/-3.5651393477088296
Excel formatting limit for ('model_estimates_for_H3K4me3vsH3K4me1', 'logFC_imputed'): +/-2.637856239138143
Excel formatting limit for ('model_estimates_for_H3K4me1vsControl', 'logFC'): +/-2.5994575945601377
Excel formatting limit for ('model_estimates_for_H3K4me3vsControl', 'logFC'): +/-3.202340388784688
Excel formatting limit for ('model_estimates_for_H3K4me3vsH3K4me1', 'logFC'): +/-2.637856239138143
Done!


In [31]:
# Show the metadata block (the columns under the 'Metadata' header group) for inspection.
excel_metadata

Unnamed: 0_level_0,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata
Unnamed: 0_level_1,Label,Accession,Gene,MARCS Gene label(s),MARCS Gene label(s) linked via,Description,# PSMs,# Peptides,# Unique Peptides,Coverage [%]
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
GBE1,GBE1,Q04446,GBE1,,,"1,4-alpha-glucan-branching enzyme OS=Homo sapi...",2,2,2,3
EFTUD2,EFTUD2,Q15029,EFTUD2,EFTUD2,Accession;Gene name,116 kDa U5 small nuclear ribonucleoprotein com...,44,15,14,23
YWHAB,YWHAB,P31946,YWHAB,,,14-3-3 protein beta/alpha OS=Homo sapiens OX=9...,44,9,4,43
YWHAE,YWHAE,P62258,YWHAE,,,14-3-3 protein epsilon OS=Homo sapiens OX=9606...,53,11,8,49
YWHAH,YWHAH,Q04917,YWHAH,,,14-3-3 protein eta OS=Homo sapiens OX=9606 GN=...,38,9,5,42
...,...,...,...,...,...,...,...,...,...,...
ZNF800,ZNF800,Q2TB10,ZNF800,,,Zinc finger protein 800 OS=Homo sapiens OX=960...,9,4,4,9
ZRANB2,ZRANB2,O95218,ZRANB2,ZRANB2,Accession;Gene name,Zinc finger Ran-binding domain-containing prot...,6,4,4,10
ZFR,ZFR,Q96KR1,ZFR,ZFR,Accession;Gene name,Zinc finger RNA-binding protein OS=Homo sapien...,3,2,2,4
AZGP1,AZGP1,P25311,AZGP1,,,Zinc-alpha-2-glycoprotein OS=Homo sapiens OX=9...,36,4,4,18
