In [None]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import os
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
# Global seaborn look: colour cycle and 'paper' scaling context.
sns.set_palette('Dark2')
sns.set_context('paper')

# Axes-style overrides, grouped by the plot element they affect.
axes_style_overrides = {
    # Backgrounds
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'image.cmap': 'Greys',

    # Axes frame: dark-grey edges, left and bottom spines only
    'axes.edgecolor': '.15',
    'axes.labelcolor': '.15',
    'axes.axisbelow': True,
    'axes.spines.right': False,
    'axes.spines.top': False,
    'text.color': '.15',

    # Dotted, semi-transparent grid
    'axes.grid': True,
    'grid.color': '.15',
    'grid.linestyle': ':',
    'grid.alpha': .5,

    # Ticks: outward-facing, on the left and bottom sides only
    'xtick.top': False,
    'ytick.right': False,
    'xtick.color': '.15',
    'ytick.color': '.15',
    'xtick.direction': 'out',
    'ytick.direction': 'out',

    # Legend and lines
    'legend.frameon': False,
    'legend.numpoints': 1,
    'legend.scatterpoints': 1,
    'lines.solid_capstyle': 'butt',
}
sns.set_style(axes_style_overrides)


import matplotlib

# --- Typography: a single point size for every text element ---------------
FONT_SIZE_PT = 5
matplotlib.rcParams.update({
    'font.family': 'Arial',
    'font.size': FONT_SIZE_PT,
    'axes.labelsize': FONT_SIZE_PT,
    'axes.titlesize': FONT_SIZE_PT,
    'figure.titlesize': FONT_SIZE_PT,
    'xtick.labelsize': FONT_SIZE_PT,
    'ytick.labelsize': FONT_SIZE_PT,
    'legend.fontsize': FONT_SIZE_PT,
    'legend.title_fontsize': FONT_SIZE_PT,
})

# --- Tick geometry (identical for both axes) -------------------------------
matplotlib.rcParams.update({
    'xtick.major.size': 2,
    'ytick.major.size': 2,
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
    'xtick.minor.size': 1,
    'ytick.minor.size': 1,
    'xtick.minor.width': 0.5,
    'ytick.minor.width': 0.5,
})

# --- Line widths and marker sizes ------------------------------------------
matplotlib.rcParams.update({
    'axes.linewidth': 0.5,
    'lines.linewidth': 0.5,
    'grid.linewidth': 0.25,
    'patch.linewidth': 0.25,
    'lines.markeredgewidth': 0.25,
    'lines.markersize': 2,
})

# --- Figure size and resolution --------------------------------------------
# Default figure is 10 x 9 units of 5 mm (expressed in inches for matplotlib).
FIVE_MM_IN_INCH = 0.19685
DPI = 600
matplotlib.rcParams.update({
    'figure.figsize': (10 * FIVE_MM_IN_INCH, 9 * FIVE_MM_IN_INCH),
    'savefig.dpi': DPI,
    # On-screen rendering uses a quarter of the saved resolution.
    'figure.dpi': DPI // 4,
})

# --- Font embedding in vector output ---------------------------------------
# Font type 42 for PDF/PS output; see http://phyletica.org/matplotlib-fonts/
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

# (01) Extracting the dataset.

This notebook handles the first step of the analysis, namely the ingestion of the raw dataset
and a few transformations such as assigning gene names and readable labels to proteins.

## Configuration

Input file (the raw dataset):

In [None]:
# Raw dataset (Excel workbook). The path is relative, so it is resolved
# against the directory the notebook is run from.
INPUT_RAW_DATA = 'data/Chip_MS_H3K4me1_H3K4me3_2022_unnorm.xlsx'

Output directory:

In [None]:
import pathlib
# Everything this notebook writes goes under outputs/01-extracting.
OUTPUT_DIRECTORY = pathlib.Path('outputs') / '01-extracting'

# Create the directory together with any missing parents; nothing happens
# when it is already there.
OUTPUT_DIRECTORY.mkdir(parents=True, exist_ok=True)

Parameters, constants

## Reading Excel

We first load the data:

In [None]:
# Load the workbook, using its first column as the row index.
data = pd.read_excel(INPUT_RAW_DATA, index_col=0)
data

### Parsing Gene Names

Now we attempt to parse the gene names from the `Description` Column in the data.

Particularly, the description column contains the following information:

In [None]:
# Peek at the first few descriptions to see how they are formatted.
data['Description'].head()

E.g. `Accession=Q04446` has the following description:

```
1,4-alpha-glucan-branching enzyme OS=Homo sapiens OX=9606 GN=GBE1 PE=1 SV=3
```

In this entry we see the segment `GN=GBE1` which tells us that the gene name of this protein is `GBE1`

Note that almost all proteins have their gene name encoded in the `GN` portion of the description. The exceptions are:

In [None]:
# Descriptions that lack a `GN=` field, i.e. the ones no gene name can be
# parsed from. Matching the literal `GN=` (rather than any "GN" substring)
# avoids hiding entries whose protein name merely contains the letters "GN".
# `na=False` makes a missing description count as lacking the field instead
# of producing a non-boolean mask that cannot be negated.
lacks_gn_field = ~data['Description'].str.contains('GN=', regex=False, na=False)
data.loc[lacks_gn_field, 'Description'].unique()

It is therefore quite safe to parse the gene names below, as long as we deal with this one exception above.

The function `parse_gn` does that:

In [None]:
import re
def parse_gn(description):
    """
    Parses the gene name from a `GN=ABCD`-like field in the description.

    The gene name is the text that follows `GN=`, up to the next
    two-capital-letter field marker (such as `PE=`) or, when `GN=` is the
    last field, up to the end of the description. Surrounding whitespace
    is stripped.

    Parameters
    ----------
    description : str
        Protein description, e.g.
        `... OS=Homo sapiens OX=9606 GN=GBE1 PE=1 SV=3`.

    Returns
    -------
    str or None
        The gene name, or None when `description` is not a string
        (e.g. a missing value), contains no `GN=` field, or the field is empty.
    """
    # Missing cells arrive as NaN/None; re.match would raise TypeError on them.
    if not isinstance(description, str):
        return None

    # `(?:[A-Z][A-Z]=|$)` lets the gene name be terminated either by the next
    # field marker or by the end of the string (GN= as the final field).
    match = re.match(r'.*GN=(?P<gene_name>.*?)(?:[A-Z][A-Z]=|$)', description)
    if match is None:
        return None

    gene_name = match.group('gene_name').strip()
    # An empty `GN=` field carries no usable name; report it as missing.
    return gene_name or None
# One gene name per row; null where `parse_gn` could not extract one.
data['Gene'] = data['Description'].map(parse_gn)

As expected, the parsing failed for only one gene:

In [None]:
# Rows for which no gene name could be parsed.
data.loc[data['Gene'].isnull()]

To account for the exception, we create another column, `Label`, which is set to `Gene` when the gene name is present and to `Accession` when it is not.

In [None]:
# Rows without a parsed gene name.
gene_is_missing = data['Gene'].isnull()

# Start from the gene name ...
data['Label'] = data['Gene'].copy()
# ... and fall back to the Accession (i.e. the index value) where it is missing.
data.loc[gene_is_missing, 'Label'] = data.index[gene_is_missing]

As a result, the `Label` column is never null (as we would expect):

In [None]:
# Every row must have a label by now.
assert data['Label'].notnull().all()

But unfortunately, some labels are duplicated (as some gene names are duplicated):

In [None]:
# True when at least one label occurs more than once.
data['Label'].duplicated().any()

In [None]:
# keep=False flags every member of a duplicate group, not just the repeats.
shares_label = data['Label'].duplicated(keep=False)
indices_of_duplicated_labels = data.index[shares_label]
data.loc[indices_of_duplicated_labels]

In such cases we will just add suffixes `(1)` and `(2)` to the duplicated genes, using the convention that the gene with higher coverage gets a lower number:

In [None]:
duplicated_rows = data.loc[indices_of_duplicated_labels]

# Maps index value (Accession) -> new, suffixed label.
renames = {}
for shared_label, group in duplicated_rows.groupby('Label'):
    # Convention: the row with the highest Coverage [%] gets (1), the next (2), ...
    by_coverage = group.sort_values(by='Coverage [%]', ascending=False)
    renames.update(
        (accession, '{} ({})'.format(shared_label, rank))
        for rank, accession in enumerate(by_coverage.index, start=1)
    )

# Write the new labels back into the main table.
for accession, new_label in renames.items():
    data.loc[accession, 'Label'] = new_label

The duplicated labels now look like this:

In [None]:
# Labels of the formerly duplicated rows, now carrying their numeric suffix.
data.loc[indices_of_duplicated_labels, 'Label']

This makes all labels unique:

In [None]:
# Labels must be unique before they can serve as the index.
assert data['Label'].is_unique

And therefore we can set a natural index to our data, i.e. the Label column:

In [None]:
# reset_index() moves the current index back into a regular column, so it is
# kept in the table; Label then becomes the new index.
data = data.reset_index().set_index('Label')
data

### Splitting numeric data and metadata

At this point it makes sense to split the data into the numeric columns and the remaining metadata:

In [None]:
# Measurement columns: three numbered columns for each of H3, H4,
# H3K4me3 and H3K4me1.
numeric_columns = [
    'H3_1', 'H3_2', 'H3_3',
    'H4_1', 'H4_2', 'H4_3',
    'H3K4me3_1', 'H3K4me3_2', 'H3K4me3_3',
    'H3K4me1_1', 'H3K4me1_2', 'H3K4me1_3',
]
data_numeric = data[numeric_columns]

# Every other column is metadata. Index.difference returns the names sorted,
# so the metadata columns come out in alphabetical order.
metadata_columns = data.columns.difference(data_numeric.columns)
data_metadata = data[metadata_columns]

In [None]:
# Numeric measurement table, indexed by Label.
data_numeric

In [None]:
# Remaining (non-measurement) columns, indexed by Label.
data_metadata

And this is pretty much everything that we needed to do in this notebook, so let's just save the outputs

In [None]:
# Write both tables as CSV files into the output directory.
for csv_name, table in (
    ('data_numeric.csv', data_numeric),
    ('data_metadata.csv', data_metadata),
):
    table.to_csv(OUTPUT_DIRECTORY / csv_name)