In [1]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import os
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
# Style overrides handed to seaborn; keys are matplotlib rc parameter names.
SEABORN_STYLE = {
    # Axes
    'axes.axisbelow': True,
    'axes.edgecolor': '.15',
    'axes.facecolor': 'white',
    'axes.grid': True,
    'axes.labelcolor': '.15',
    'axes.spines.right': False,
    'axes.spines.top': False,
    # Figure, text and images
    'figure.facecolor': 'white',
    'text.color': '.15',
    'image.cmap': 'Greys',
    # Grid
    'grid.color': '.15',
    'grid.linestyle': ':',
    'grid.alpha': .5,
    # Legend
    'legend.frameon': False,
    'legend.numpoints': 1,
    'legend.scatterpoints': 1,
    # Lines
    'lines.solid_capstyle': 'butt',
    # Ticks
    'xtick.top': False,
    'ytick.right': False,
    'xtick.color': '.15',
    'xtick.direction': 'out',
    'ytick.color': '.15',
    'ytick.direction': 'out',
}

sns.set_palette('Dark2')
sns.set_context('paper')
sns.set_style(SEABORN_STYLE)


import matplotlib

# Single font size (in points) used for every text element configured below.
FONT_SIZE_PT = 5

# Figure geometry: the default figure size is given in multiples of 5 mm,
# converted to inches.
FIVE_MM_IN_INCH = 0.19685
DPI = 600

_RC_OVERRIDES = {
    # Fonts
    'font.family': 'Arial',
    'font.size': FONT_SIZE_PT,
    'axes.labelsize': FONT_SIZE_PT,
    'axes.titlesize': FONT_SIZE_PT,
    'figure.titlesize': FONT_SIZE_PT,
    'xtick.labelsize': FONT_SIZE_PT,
    'ytick.labelsize': FONT_SIZE_PT,
    'legend.fontsize': FONT_SIZE_PT,
    'legend.title_fontsize': FONT_SIZE_PT,
    # Major ticks
    'xtick.major.size': 2,
    'ytick.major.size': 2,
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
    # Minor ticks
    'xtick.minor.size': 1,
    'ytick.minor.size': 1,
    'xtick.minor.width': 0.5,
    'ytick.minor.width': 0.5,
    # Line widths and markers
    'axes.linewidth': 0.5,
    'lines.linewidth': 0.5,
    'grid.linewidth': 0.25,
    'patch.linewidth': 0.25,
    'lines.markeredgewidth': 0.25,
    'lines.markersize': 2,
    # Figure size and resolution (on-screen resolution is a quarter of the saved one)
    'figure.figsize': (10 * FIVE_MM_IN_INCH, 9 * FIVE_MM_IN_INCH),
    'savefig.dpi': DPI,
    'figure.dpi': DPI // 4,
    # Font embedding in vector output, see
    # http://phyletica.org/matplotlib-fonts/
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
}

for rc_key, rc_value in _RC_OVERRIDES.items():
    matplotlib.rcParams[rc_key] = rc_value

# (01) Extracting the dataset.

This notebook handles the first step of the analysis - namely the ingestion of the raw dataset
and a few transformations, such as assigning gene names and readable labels to proteins.

## Configuration

Input file (the raw dataset):

In [2]:
# Path to the raw Excel workbook, relative to the notebook's working directory.
INPUT_RAW_DATA = 'data/Chip_MS_H3K4me1_H3K4me3_2022_unnorm.xlsx'

Output directory:

In [3]:
import pathlib

# Every file written by this notebook goes below this directory.
OUTPUT_DIRECTORY = pathlib.Path('outputs') / '01-extracting'

# `exist_ok=True` replaces the separate "is it there yet?" check: the call is
# a no-op when the directory already exists, so the cell can be re-run safely
# and there is no gap between checking and creating. It still raises
# FileExistsError if the path exists but is not a directory.
OUTPUT_DIRECTORY.mkdir(parents=True, exist_ok=True)

Parameters, constants

## Reading Excel

We first load the data:

In [4]:
# Load the raw workbook; the first spreadsheet column is used as the row index.
data = pd.read_excel(INPUT_RAW_DATA, index_col=0)
data

Unnamed: 0_level_0,Description,Coverage [%],# Peptides,# PSMs,# Unique Peptides,H3_1,H3_2,H3_3,H4_1,H4_2,H4_3,H3K4me3_1,H3K4me3_2,H3K4me3_3,H3K4me1_1,H3K4me1_2,H3K4me1_3
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Q04446,"1,4-alpha-glucan-branching enzyme OS=Homo sapi...",3,2,2,2,,,,,6.244600e+04,,,6.454938e+04,,,,
Q15029,116 kDa U5 small nuclear ribonucleoprotein com...,23,15,44,14,2.522426e+05,1.730602e+05,1.584524e+05,1.189396e+05,6.677202e+05,4.115486e+04,5.686581e+05,2.978815e+05,5.589060e+05,1.053073e+06,,1.746669e+05
P31946,14-3-3 protein beta/alpha OS=Homo sapiens OX=9...,43,9,44,4,2.472521e+05,3.426881e+05,9.039318e+05,2.274400e+05,5.873763e+05,2.250794e+05,3.055163e+05,4.152821e+05,8.572419e+05,8.089974e+05,4.741039e+05,1.421636e+05
P62258,14-3-3 protein epsilon OS=Homo sapiens OX=9606...,49,11,53,8,3.443775e+05,1.929841e+05,,3.170019e+05,3.192067e+05,4.202944e+05,2.851882e+05,3.572551e+05,7.482891e+05,4.671955e+05,3.569694e+05,4.508984e+05
Q04917,14-3-3 protein eta OS=Homo sapiens OX=9606 GN=...,42,9,38,5,1.238786e+08,1.892759e+08,5.117897e+08,6.823328e+07,1.175261e+08,7.435374e+07,8.311959e+07,8.932719e+07,1.318390e+08,5.130819e+07,7.274683e+07,1.709113e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q2TB10,Zinc finger protein 800 OS=Homo sapiens OX=960...,9,4,9,4,1.516332e+06,9.484857e+05,7.395190e+05,2.055031e+05,7.753342e+05,5.426501e+05,2.734432e+06,1.524918e+06,3.320131e+06,1.976328e+06,1.619403e+06,2.126770e+06
O95218,Zinc finger Ran-binding domain-containing prot...,10,4,6,4,1.354622e+05,1.981103e+05,,,2.034418e+05,1.775103e+05,8.948932e+04,1.117538e+05,4.396195e+05,3.247958e+05,,
Q96KR1,Zinc finger RNA-binding protein OS=Homo sapien...,4,2,3,2,,,,3.751825e+05,3.159557e+05,,,6.455933e+05,1.824670e+06,1.449104e+06,7.430923e+05,6.052774e+05
P25311,Zinc-alpha-2-glycoprotein OS=Homo sapiens OX=9...,18,4,36,4,7.092820e+06,7.777356e+06,1.442498e+07,4.493467e+06,6.012604e+06,5.489928e+06,4.363090e+06,5.296675e+06,8.245945e+06,9.541946e+06,1.197661e+07,1.351196e+07


### Parsing Gene Names

Now we attempt to parse the gene names from the `Description` Column in the data.

Particularly, the description column contains the following information:

In [5]:
# Preview the first few raw descriptions.
data['Description'].head()

Accession
Q04446    1,4-alpha-glucan-branching enzyme OS=Homo sapi...
Q15029    116 kDa U5 small nuclear ribonucleoprotein com...
P31946    14-3-3 protein beta/alpha OS=Homo sapiens OX=9...
P62258    14-3-3 protein epsilon OS=Homo sapiens OX=9606...
Q04917    14-3-3 protein eta OS=Homo sapiens OX=9606 GN=...
Name: Description, dtype: object

E.g. `Accession=Q04446` has the following description:

```
1,4-alpha-glucan-branching enzyme OS=Homo sapiens OX=9606 GN=GBE1 PE=1 SV=3
```

In this entry we see the segment `GN=GBE1` which tells us that the gene name of this protein is `GBE1`

Note that almost all proteins carry their gene name in the `GN=` field of the description; the only exception is:

In [6]:
# Descriptions that lack a gene-name field.
# Match the literal `GN=` tag rather than the bare letters "GN" (which could
# also occur inside a protein name), and count missing descriptions as
# "no tag" instead of letting NaN break the boolean negation.
has_gene_tag = data['Description'].str.contains('GN=', regex=False, na=False)
data.loc[~has_gene_tag, 'Description'].unique()

array(['Immunoglobulin kappa light chain OS=Homo sapiens OX=9606 PE=1 SV=1'],
      dtype=object)

It is therefore quite safe to parse the gene names below, as long as we deal with this one exception above.

The function `parse_gn` does that:

In [7]:
import re
def parse_gn(description):
    """
    Parses the gene name from `GN=ABCD` like string in the description.

    The gene name is the text that follows the first `GN=` tag, up to the
    next two-capital-letter tag (such as `PE=`) or up to the end of the
    description when `GN=` is the last field.

    Parameters
    ----------
    description : str
        Protein description, e.g.
        `'... OS=Homo sapiens OX=9606 GN=GBE1 PE=1 SV=3'`.
        Non-string values (e.g. None or NaN for a missing cell) are accepted.

    Returns
    -------
    str or None
        The gene name with surrounding whitespace removed, or None when the
        description is not a string, has no `GN=` tag, or the tag is empty.
    """
    # Missing cells arrive as None/NaN; they carry no gene name.
    if not isinstance(description, str):
        return None

    # `(?:[A-Z][A-Z]=|$)` lets the gene name be terminated either by the next
    # tag or by the end of the string, so a trailing `GN=` field is parsed too.
    match = re.search(
        r'\bGN=(?P<gene_name>.*?)\s*(?:[A-Z][A-Z]=|$)',
        description,
    )
    if match is None:
        return None

    # An empty tag (`GN= PE=1`) is treated the same as a missing one.
    gene_name = match.group('gene_name').strip()
    return gene_name or None
# Parse every description; rows without a parsable gene name get None.
data['Gene'] = data['Description'].map(parse_gn)

As expected, the parsing failed for only one gene:

In [8]:
# Rows for which no gene name could be parsed.
data[data['Gene'].isnull()]

Unnamed: 0_level_0,Description,Coverage [%],# Peptides,# PSMs,# Unique Peptides,H3_1,H3_2,H3_3,H4_1,H4_2,H4_3,H3K4me3_1,H3K4me3_2,H3K4me3_3,H3K4me1_1,H3K4me1_2,H3K4me1_3,Gene
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
P0DOX7,Immunoglobulin kappa light chain OS=Homo sapie...,14,2,11,2,597716.0,524690.625,718430.9375,599742.140625,365873.15625,641033.242188,995162.0,961910.1875,1599727.5,1836154.0,1304328.0,2307713.0,


To account for the exception, we create another column, `Label`, which is set to `Gene` when the gene name is present and to `Accession` when it is not.

In [9]:
# Rows whose gene name could not be parsed.
gene_is_missing = data['Gene'].isnull()

# The label defaults to the gene name ...
data['Label'] = data['Gene'].copy()
# ... and falls back to the Accession (i.e. the index) where the gene is missing.
data.loc[gene_is_missing, 'Label'] = data.index[gene_is_missing.values]

As expected, the `Label` column now never contains nulls:

In [10]:
# Sanity check: every row must have a label at this point.
assert not data['Label'].isnull().any()

But unfortunately, some labels are duplicated (as some gene names are duplicated):

In [11]:
# True when at least one label occurs more than once.
data['Label'].duplicated().any()

True

In [12]:
# `keep=False` flags every member of a duplicated group, not only the repeats,
# so all rows sharing a label are collected here.
indices_of_duplicated_labels = data[data['Label'].duplicated(keep=False)].index
data.loc[indices_of_duplicated_labels]

Unnamed: 0_level_0,Description,Coverage [%],# Peptides,# PSMs,# Unique Peptides,H3_1,H3_2,H3_3,H4_1,H4_2,H4_3,H3K4me3_1,H3K4me3_2,H3K4me3_3,H3K4me1_1,H3K4me1_2,H3K4me1_3,Gene,Label
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
P42166,"Lamina-associated polypeptide 2, isoform alpha...",23,11,78,2,465867.9,,996752.6,1081027.0,864604.2,972623.375,506054.0,348692.5625,471762.9,551986.3,605070.3,667623.875,TMPO,TMPO
P42167,"Lamina-associated polypeptide 2, isoforms beta...",48,15,112,6,6119934.0,4743520.0,6311578.0,2919314.0,3508533.0,2989788.0,2284912.0,391135.458333,2962563.0,2692815.0,6932885.0,2810912.0,TMPO,TMPO


In such cases we will just add suffixes `(1)` and `(2)` to the duplicated genes, using the convention that the gene with higher coverage gets a lower number:

In [13]:
# Maps a row index (Accession) to its new, suffixed label.
renames = {}
for label, subdata in data.loc[indices_of_duplicated_labels].groupby('Label'):

    # A label held by a single row needs no suffix. This also keeps the cell
    # safe to re-run: once the labels carry their suffixes every group is a
    # singleton, so no second suffix is appended.
    if len(subdata) < 2:
        continue

    # As a convention, higher Coverage [%] gets lower number.
    # A stable sort keeps rows with equal coverage in a deterministic order.
    subdata = subdata.sort_values(
        by='Coverage [%]', ascending=False, kind='mergesort'
    )

    for i, ix in enumerate(subdata.index, start=1):
        # `label` is the group key, i.e. the shared label of these rows.
        renames[ix] = '{} ({})'.format(label, i)

for ix, new_label in renames.items():
    data.loc[ix, 'Label'] = new_label

The duplicated labels now look like this:

In [14]:
# Labels of the previously duplicated rows after suffixing.
data.loc[indices_of_duplicated_labels, 'Label']

Accession
P42166    TMPO (2)
P42167    TMPO (1)
Name: Label, dtype: object

This makes all labels unique:

In [15]:
# Sanity check: labels must be unique before they are used as the index.
assert not data['Label'].duplicated().any()

And therefore we can set a natural index to our data, i.e. the Label column:

In [16]:
# Move the current index (Accession) back into a regular column and
# index the rows by Label instead.
data = data.reset_index().set_index('Label')
data

Unnamed: 0_level_0,Accession,Description,Coverage [%],# Peptides,# PSMs,# Unique Peptides,H3_1,H3_2,H3_3,H4_1,H4_2,H4_3,H3K4me3_1,H3K4me3_2,H3K4me3_3,H3K4me1_1,H3K4me1_2,H3K4me1_3,Gene
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
GBE1,Q04446,"1,4-alpha-glucan-branching enzyme OS=Homo sapi...",3,2,2,2,,,,,6.244600e+04,,,6.454938e+04,,,,,GBE1
EFTUD2,Q15029,116 kDa U5 small nuclear ribonucleoprotein com...,23,15,44,14,2.522426e+05,1.730602e+05,1.584524e+05,1.189396e+05,6.677202e+05,4.115486e+04,5.686581e+05,2.978815e+05,5.589060e+05,1.053073e+06,,1.746669e+05,EFTUD2
YWHAB,P31946,14-3-3 protein beta/alpha OS=Homo sapiens OX=9...,43,9,44,4,2.472521e+05,3.426881e+05,9.039318e+05,2.274400e+05,5.873763e+05,2.250794e+05,3.055163e+05,4.152821e+05,8.572419e+05,8.089974e+05,4.741039e+05,1.421636e+05,YWHAB
YWHAE,P62258,14-3-3 protein epsilon OS=Homo sapiens OX=9606...,49,11,53,8,3.443775e+05,1.929841e+05,,3.170019e+05,3.192067e+05,4.202944e+05,2.851882e+05,3.572551e+05,7.482891e+05,4.671955e+05,3.569694e+05,4.508984e+05,YWHAE
YWHAH,Q04917,14-3-3 protein eta OS=Homo sapiens OX=9606 GN=...,42,9,38,5,1.238786e+08,1.892759e+08,5.117897e+08,6.823328e+07,1.175261e+08,7.435374e+07,8.311959e+07,8.932719e+07,1.318390e+08,5.130819e+07,7.274683e+07,1.709113e+08,YWHAH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF800,Q2TB10,Zinc finger protein 800 OS=Homo sapiens OX=960...,9,4,9,4,1.516332e+06,9.484857e+05,7.395190e+05,2.055031e+05,7.753342e+05,5.426501e+05,2.734432e+06,1.524918e+06,3.320131e+06,1.976328e+06,1.619403e+06,2.126770e+06,ZNF800
ZRANB2,O95218,Zinc finger Ran-binding domain-containing prot...,10,4,6,4,1.354622e+05,1.981103e+05,,,2.034418e+05,1.775103e+05,8.948932e+04,1.117538e+05,4.396195e+05,3.247958e+05,,,ZRANB2
ZFR,Q96KR1,Zinc finger RNA-binding protein OS=Homo sapien...,4,2,3,2,,,,3.751825e+05,3.159557e+05,,,6.455933e+05,1.824670e+06,1.449104e+06,7.430923e+05,6.052774e+05,ZFR
AZGP1,P25311,Zinc-alpha-2-glycoprotein OS=Homo sapiens OX=9...,18,4,36,4,7.092820e+06,7.777356e+06,1.442498e+07,4.493467e+06,6.012604e+06,5.489928e+06,4.363090e+06,5.296675e+06,8.245945e+06,9.541946e+06,1.197661e+07,1.351196e+07,AZGP1


### Splitting numeric data and metadata

At this point it makes sense to split the data into the numeric columns and the remaining metadata:

In [17]:
# The twelve sample columns: suffixes _1.._3 for each of
# H3, H4, H3K4me3 and H3K4me1.
numeric_columns = [
    'H3_1', 'H3_2', 'H3_3',
    'H4_1', 'H4_2', 'H4_3',
    'H3K4me3_1', 'H3K4me3_2', 'H3K4me3_3',
    'H3K4me1_1', 'H3K4me1_2', 'H3K4me1_3',
]
data_numeric = data[numeric_columns]

# Every remaining column is metadata. Note that `Index.difference` returns
# the column names in sorted order, not in their original order.
metadata_columns = data.columns.difference(numeric_columns)
data_metadata = data[metadata_columns]

In [18]:
# Preview the numeric sample columns.
data_numeric

Unnamed: 0_level_0,H3_1,H3_2,H3_3,H4_1,H4_2,H4_3,H3K4me3_1,H3K4me3_2,H3K4me3_3,H3K4me1_1,H3K4me1_2,H3K4me1_3
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
GBE1,,,,,6.244600e+04,,,6.454938e+04,,,,
EFTUD2,2.522426e+05,1.730602e+05,1.584524e+05,1.189396e+05,6.677202e+05,4.115486e+04,5.686581e+05,2.978815e+05,5.589060e+05,1.053073e+06,,1.746669e+05
YWHAB,2.472521e+05,3.426881e+05,9.039318e+05,2.274400e+05,5.873763e+05,2.250794e+05,3.055163e+05,4.152821e+05,8.572419e+05,8.089974e+05,4.741039e+05,1.421636e+05
YWHAE,3.443775e+05,1.929841e+05,,3.170019e+05,3.192067e+05,4.202944e+05,2.851882e+05,3.572551e+05,7.482891e+05,4.671955e+05,3.569694e+05,4.508984e+05
YWHAH,1.238786e+08,1.892759e+08,5.117897e+08,6.823328e+07,1.175261e+08,7.435374e+07,8.311959e+07,8.932719e+07,1.318390e+08,5.130819e+07,7.274683e+07,1.709113e+08
...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF800,1.516332e+06,9.484857e+05,7.395190e+05,2.055031e+05,7.753342e+05,5.426501e+05,2.734432e+06,1.524918e+06,3.320131e+06,1.976328e+06,1.619403e+06,2.126770e+06
ZRANB2,1.354622e+05,1.981103e+05,,,2.034418e+05,1.775103e+05,8.948932e+04,1.117538e+05,4.396195e+05,3.247958e+05,,
ZFR,,,,3.751825e+05,3.159557e+05,,,6.455933e+05,1.824670e+06,1.449104e+06,7.430923e+05,6.052774e+05
AZGP1,7.092820e+06,7.777356e+06,1.442498e+07,4.493467e+06,6.012604e+06,5.489928e+06,4.363090e+06,5.296675e+06,8.245945e+06,9.541946e+06,1.197661e+07,1.351196e+07


In [19]:
# Preview the remaining (metadata) columns.
data_metadata

Unnamed: 0_level_0,# PSMs,# Peptides,# Unique Peptides,Accession,Coverage [%],Description,Gene
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GBE1,2,2,2,Q04446,3,"1,4-alpha-glucan-branching enzyme OS=Homo sapi...",GBE1
EFTUD2,44,15,14,Q15029,23,116 kDa U5 small nuclear ribonucleoprotein com...,EFTUD2
YWHAB,44,9,4,P31946,43,14-3-3 protein beta/alpha OS=Homo sapiens OX=9...,YWHAB
YWHAE,53,11,8,P62258,49,14-3-3 protein epsilon OS=Homo sapiens OX=9606...,YWHAE
YWHAH,38,9,5,Q04917,42,14-3-3 protein eta OS=Homo sapiens OX=9606 GN=...,YWHAH
...,...,...,...,...,...,...,...
ZNF800,9,4,4,Q2TB10,9,Zinc finger protein 800 OS=Homo sapiens OX=960...,ZNF800
ZRANB2,6,4,4,O95218,10,Zinc finger Ran-binding domain-containing prot...,ZRANB2
ZFR,3,2,2,Q96KR1,4,Zinc finger RNA-binding protein OS=Homo sapien...,ZFR
AZGP1,36,4,4,P25311,18,Zinc-alpha-2-glycoprotein OS=Homo sapiens OX=9...,AZGP1


And this is pretty much everything that we needed to do in this notebook, so let's just save the outputs

In [20]:
# Write both tables as CSV files into the output directory.
for filename, frame in (
    ('data_numeric.csv', data_numeric),
    ('data_metadata.csv', data_metadata),
):
    frame.to_csv(OUTPUT_DIRECTORY / filename)