In [1]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import os
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
sns.set_palette('Dark2')
sns.set_context('paper')
sns.set_style({'axes.axisbelow': True, 
               'axes.edgecolor': '.15',
               'axes.facecolor': 'white',
               'axes.grid': True, 
               'axes.labelcolor': '.15', 
               'figure.facecolor': 'white', 
               'grid.color': '.15',
               'grid.linestyle': ':', 
               'grid.alpha': .5, 
               'image.cmap': 'Greys', 
               'legend.frameon': False, 
               'legend.numpoints': 1, 
               'legend.scatterpoints': 1,
               'lines.solid_capstyle': 'butt', 
               'axes.spines.right': False, 
               'axes.spines.top': False,  
               'text.color': '.15',  
               'xtick.top': False, 
               'ytick.right': False, 
               'xtick.color': '.15',
               'xtick.direction': 'out', 
               'ytick.color': '.15', 
               'ytick.direction': 'out', 
              })


import matplotlib

FONT_SIZE_PT = 5
matplotlib.rcParams['font.family'] = 'Arial'
matplotlib.rcParams['font.size'] = FONT_SIZE_PT
matplotlib.rcParams['axes.labelsize'] = FONT_SIZE_PT
matplotlib.rcParams['axes.titlesize'] = FONT_SIZE_PT
matplotlib.rcParams['figure.titlesize'] = FONT_SIZE_PT
matplotlib.rcParams['xtick.labelsize'] = FONT_SIZE_PT
matplotlib.rcParams['ytick.labelsize'] = FONT_SIZE_PT
matplotlib.rcParams['legend.fontsize'] = FONT_SIZE_PT
matplotlib.rcParams['legend.title_fontsize'] = FONT_SIZE_PT

matplotlib.rcParams['xtick.major.size'] = matplotlib.rcParams['ytick.major.size'] = 2
matplotlib.rcParams['xtick.major.width'] = matplotlib.rcParams['ytick.major.width'] = 0.5


matplotlib.rcParams['xtick.minor.size'] = matplotlib.rcParams['ytick.minor.size'] = 1

matplotlib.rcParams['xtick.minor.width'] = matplotlib.rcParams['ytick.minor.width'] = 0.5

matplotlib.rcParams['axes.linewidth'] = 0.5
matplotlib.rcParams['lines.linewidth'] = 0.5
matplotlib.rcParams['grid.linewidth'] = 0.25
matplotlib.rcParams['patch.linewidth'] = 0.25
matplotlib.rcParams['lines.markeredgewidth'] = 0.25
matplotlib.rcParams['lines.markersize'] = 2

FIVE_MM_IN_INCH = 0.19685
DPI = 600
matplotlib.rcParams['figure.figsize'] = (10 * FIVE_MM_IN_INCH, 9 * FIVE_MM_IN_INCH)
matplotlib.rcParams['savefig.dpi'] = DPI
matplotlib.rcParams['figure.dpi'] = DPI // 4


#http://phyletica.org/matplotlib-fonts/
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

# (01) Extracting the dataset.

This notebook handles the first step of analysis - namely the ingestion of the raw dataset and few transformations such as assigning Gene names and readable labels to proteins

## Configuration

Input files (raw dataset)

In [2]:
INPUT_RAW_DATA = {
    'long-linkers-enh': 'data/long_linkers_enhancer_PTMs_research_11_2022.xlsx',
    'long-linkers-prom': 'data/long_linkers_promoter_PTMs_research_11_2022.xlsx',
    'short-linkers': 'data/NPD_short_linkers.xlsx'
}

Output directory

In [3]:
import pathlib
OUTPUT_DIRECTORY = pathlib.Path('outputs') / '01-extracting'

if not OUTPUT_DIRECTORY.is_dir():
    OUTPUT_DIRECTORY.mkdir(parents=True)

## Reading Excel Files

In [4]:
datasets = {
    k: pd.read_excel(v, index_col=0) for k,v in INPUT_RAW_DATA.items()
}

data = pd.concat(datasets.values(), keys=datasets.keys())
data.index.names = ['Dataset', data.index.names[1]]
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Description,Coverage [%],# Peptides,# PSMs,# Unique Peptides,# AAs,MW [kDa],Score Mascot: Mascot,# Peptides (by Search Engine): Mascot,H3unmod_50bp_repl1,...,H3K27me3_45bp__3,H3K27me3_50bp__3,H3K27me3_55bp__3,H3unmod_35bp_3,H3unmod_40bp_3,H3unmod_45bp_3,H3unmod_50bp__3,H3unmod_55bp_3,H3K9me3_35bp_3,H3K9me3_40bp_3
Dataset,Accession,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
long-linkers-enh,P35579,Myosin-9 OS=Homo sapiens OX=9606 GN=MYH9 PE=1 ...,71,170,20013,137,1960,226.4,639499,170,2012982000.0,...,,,,,,,,,,
long-linkers-enh,Q13813,"Spectrin alpha chain, non-erythrocytic 1 OS=Ho...",79,205,11500,204,2472,284.4,391743,205,719872000.0,...,,,,,,,,,,
long-linkers-enh,P78527,DNA-dependent protein kinase catalytic subunit...,64,246,12445,246,4128,468.8,377897,246,718968000.0,...,,,,,,,,,,
long-linkers-enh,P60709,"Actin, cytoplasmic 1 OS=Homo sapiens OX=9606 G...",82,30,10826,13,375,41.7,369811,30,17113850000.0,...,,,,,,,,,,
long-linkers-enh,Q01082,"Spectrin beta chain, non-erythrocytic 1 OS=Hom...",79,180,10348,166,2364,274.4,350270,180,744400700.0,...,,,,,,,,,,


### Parsing gene names

Now we attempt to parse the gene names from the Description Column in the data.

Particularly, the description column contains the following information:

In [5]:
data['Description'].head()

Dataset           Accession
long-linkers-enh  P35579       Myosin-9 OS=Homo sapiens OX=9606 GN=MYH9 PE=1 ...
                  Q13813       Spectrin alpha chain, non-erythrocytic 1 OS=Ho...
                  P78527       DNA-dependent protein kinase catalytic subunit...
                  P60709       Actin, cytoplasmic 1 OS=Homo sapiens OX=9606 G...
                  Q01082       Spectrin beta chain, non-erythrocytic 1 OS=Hom...
Name: Description, dtype: object

E.g. Accession=P35579 has the following description:

Myosin-9 OS=Homo sapiens GN=MYH9 PE=1 SV=4
In this entry we see the segment GN=MYH9 which tells us that the gene name of this protein is MYH9

Note that most proteins have this gene name, encoded in the GN portion of the Description (e.g. GN=MYH9) or `Gene_Symbol` portion (e.g. `Gene_symbol=KRT10` except:

In [6]:
data[~(data['Description'].str.contains('GN') | data['Description'].str.contains('Gene_Symbol'))]['Description'].unique()

array(['Streptavidin OS=Streptomyces avidinii PE=1 SV=1',
       'SWISS-PROT:P02769 (Bos taurus) Bovine serum albumin precursor',
       'SWISS-PROT:P15636 Protease I precursor Lysyl endopeptidase Achromobacter lyticus.',
       'Trypsin - Sus scrofa (Pig).',
       'SWISS-PROT:Q28107 (Bos taurus) Coagulation factor V precursor',
       'Golgin subfamily A member 8-like protein 2 OS=Homo sapiens PE=2 SV=2',
       'TREMBL:Q9UE12 Type I hair keratin 1 - Homo sapiens (Human).',
       'SWISS-PROT:Q14525 Keratin, type I cuticular HA3-II (Hair keratin, type I HA3-II)',
       'SWISS-PROT:O76013 Keratin, type I cuticular HA6 (Hair keratin, type I HA6)',
       'SWISS-PROT:P78386 Keratin, type II cuticular Hb5 (Hair keratin, type II Hb5) - Homo sapiens (Human).',
       'SWISS-PROT:Q9NSB2 Keratin, type II cuticular Hb4 (Hair keratin, type II Hb4) - Homo sapiens (Human).',
       'TREMBL:Q8IUT8 Type I hair keratin 4 - Homo sapiens (Human).',
       'SWISS-PROT:Q3SZR3 (Bos taurus) Alpha-1-acid

The above proteins are not particularly important for us and therefore it should be safe to parse the gene names from this columns, as long as we have a fallback for the exceptions above

In [7]:
import re
def parse_gn(description):
    """
    Parses the gene name from `GN=ABCD`, or `Gene_Symbol=ABCD` like string in the description
    """
    
    match = re.match('.*(GN|Gene_Symbol)=(?P<gene_name>.*?)(\s+|$)', description)
    if match:
        return match.group('gene_name').strip()
    else:
        return None
data['Gene'] = data['Description'].map(parse_gn)

In [8]:
data[data['Description'].str.contains('GN')]['Description'].iloc[0]

'Myosin-9 OS=Homo sapiens OX=9606 GN=MYH9 PE=1 SV=4'

In [9]:
data[data['Description'].str.contains('Gene_Symbol')]['Description'].iloc[0]

'SWISS-PROT:P35527 Tax_Id=9606 Gene_Symbol=KRT9 Keratin, type I cytoskeletal 9'

In [10]:
parse_gn('Myosin-9 OS=Homo sapiens OX=9606 GN=MYH9 PE=1 SV=4')

'MYH9'

In [11]:
parse_gn('SWISS-PROT:P35527 Tax_Id=9606 Gene_Symbol=KRT9 Keratin, type I cytoskeletal 9')

'KRT9'

As expected, the parsing failed for only one gene:

In [12]:
data[data['Gene'].isnull()]['Description'].unique()

array(['Streptavidin OS=Streptomyces avidinii PE=1 SV=1',
       'SWISS-PROT:P02769 (Bos taurus) Bovine serum albumin precursor',
       'SWISS-PROT:P15636 Protease I precursor Lysyl endopeptidase Achromobacter lyticus.',
       'Trypsin - Sus scrofa (Pig).',
       'SWISS-PROT:Q28107 (Bos taurus) Coagulation factor V precursor',
       'Golgin subfamily A member 8-like protein 2 OS=Homo sapiens PE=2 SV=2',
       'TREMBL:Q9UE12 Type I hair keratin 1 - Homo sapiens (Human).',
       'SWISS-PROT:Q14525 Keratin, type I cuticular HA3-II (Hair keratin, type I HA3-II)',
       'SWISS-PROT:O76013 Keratin, type I cuticular HA6 (Hair keratin, type I HA6)',
       'SWISS-PROT:P78386 Keratin, type II cuticular Hb5 (Hair keratin, type II Hb5) - Homo sapiens (Human).',
       'SWISS-PROT:Q9NSB2 Keratin, type II cuticular Hb4 (Hair keratin, type II Hb4) - Homo sapiens (Human).',
       'TREMBL:Q8IUT8 Type I hair keratin 4 - Homo sapiens (Human).',
       'SWISS-PROT:Q3SZR3 (Bos taurus) Alpha-1-acid

To account for the exceptions, we create another column Label, which will be set to Gene, when the Gene is present and to Accesion, when it is not.

In [13]:
data['Label'] = data['Gene'].copy() # set to Gene
# Where gene is null; set to the Accession (i.e. index)
data.loc[data['Gene'].isnull(), 'Label'] = [ix[1] for ix in data.loc[data['Gene'].isnull()].index]

We now have the scenario that Label column is never null (as we would expect):

In [14]:
assert not data['Label'].isnull().any()

But unfortunately, some labels may be duplicated (as some gene names are duplicated):

In [15]:
for _dataset, subdf in data.groupby(level='Dataset'):
    
    # Find duplicates:
    subdf_duplicates = subdf[subdf['Label'].duplicated(keep=False)]
    has_duplicates = not subdf_duplicates.empty
    print('Dataset {} has duplicated labels: {}'.format(_dataset, has_duplicates))
    
    if not has_duplicates:
        continue
    
    # Resolve duplicates by adding suffix (1), (2), ... etc.
    for label, duplicate_indices in subdf_duplicates.groupby('Label'):
        duplicate_indices = duplicate_indices.sort_index()
        for i, (ix, row) in enumerate(duplicate_indices.iterrows(), start=1):
            old_label = row['Label']
            new_label = '{} ({})'.format(row['Label'], i)
            
            print(f'Renaming {old_label} ({ix[1]}) to {new_label}') 
            data.loc[ix, 'Label'] = new_label
            
    
    
    
    break
#     for (dataset, label), subduplicates in _duplicates.groupby(['Dataset', 'Label']):
#     subduplicates.sort_index()
#     for ix in enumerate(subduplicates.index, start=1):
#         subdf.loc[ix] = f'{label} ({i})'
    

Dataset long-linkers-enh has duplicated labels: False
Dataset long-linkers-prom has duplicated labels: False
Dataset short-linkers has duplicated labels: True
Renaming CALU (O43852) to CALU (1)
Renaming CALU (O43852-4) to CALU (2)
Renaming CENPV (Q7Z7K6) to CENPV (1)
Renaming CENPV (Q7Z7K6-3) to CENPV (2)
Renaming RBM14 (Q96PK6) to RBM14 (1)
Renaming RBM14 (Q96PK6-5) to RBM14 (2)
Renaming SORBS2 (O94875-11) to SORBS2 (1)
Renaming SORBS2 (O94875-8) to SORBS2 (2)
Renaming SPTAN1 (Q13813-2) to SPTAN1 (1)
Renaming SPTAN1 (Q13813-3) to SPTAN1 (2)
Renaming TPM1 (P09493-3) to TPM1 (1)
Renaming TPM1 (P09493-5) to TPM1 (2)
Renaming TPM4 (P67936) to TPM4 (1)
Renaming TPM4 (P67936-2) to TPM4 (2)


Check that no labels are duplicated any more:

In [16]:
for _dataset, subdf in data.groupby(level='Dataset'):
    
    # Find duplicates:
    subdf_duplicates = subdf[subdf['Label'].duplicated(keep=False)]
    has_duplicates = not subdf_duplicates.empty
    print('Dataset {} has duplicated labels: {}'.format(_dataset, has_duplicates))
    
    assert not has_duplicates

Dataset long-linkers-enh has duplicated labels: False
Dataset long-linkers-prom has duplicated labels: False
Dataset short-linkers has duplicated labels: False


And therefore we can now set a natural index to our data, i.e. the Label column:

In [17]:
data = data.reset_index().set_index(['Dataset', 'Label'])
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Accession,Description,Coverage [%],# Peptides,# PSMs,# Unique Peptides,# AAs,MW [kDa],Score Mascot: Mascot,# Peptides (by Search Engine): Mascot,...,H3K27me3_50bp__3,H3K27me3_55bp__3,H3unmod_35bp_3,H3unmod_40bp_3,H3unmod_45bp_3,H3unmod_50bp__3,H3unmod_55bp_3,H3K9me3_35bp_3,H3K9me3_40bp_3,Gene
Dataset,Label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
long-linkers-enh,MYH9,P35579,Myosin-9 OS=Homo sapiens OX=9606 GN=MYH9 PE=1 ...,71,170,20013,137,1960,226.4,639499,170,...,,,,,,,,,,MYH9
long-linkers-enh,SPTAN1,Q13813,"Spectrin alpha chain, non-erythrocytic 1 OS=Ho...",79,205,11500,204,2472,284.4,391743,205,...,,,,,,,,,,SPTAN1
long-linkers-enh,PRKDC,P78527,DNA-dependent protein kinase catalytic subunit...,64,246,12445,246,4128,468.8,377897,246,...,,,,,,,,,,PRKDC
long-linkers-enh,ACTB,P60709,"Actin, cytoplasmic 1 OS=Homo sapiens OX=9606 G...",82,30,10826,13,375,41.7,369811,30,...,,,,,,,,,,ACTB
long-linkers-enh,SPTBN1,Q01082,"Spectrin beta chain, non-erythrocytic 1 OS=Hom...",79,180,10348,166,2364,274.4,350270,180,...,,,,,,,,,,SPTBN1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
short-linkers,PPIH,O43447,Peptidyl-prolyl cis-trans isomerase H OS=Homo ...,11,2,2,2,177,19.2,20,2,...,5108742.5,5609398.75,6516111.25,5.161312e+06,4.887423e+06,3992957.0,5.466729e+06,6.087354e+06,6.976574e+06,PPIH
short-linkers,REST,Q13127,RE1-silencing transcription factor OS=Homo sap...,3,2,2,2,1097,121.8,19,2,...,2139342.0,1965312.00,3654041.50,4.037978e+06,2.677056e+06,3967469.5,1.667799e+06,3.378881e+06,4.286108e+06,REST
short-linkers,ZMYM2,Q9UBW7,Zinc finger MYM-type protein 2 OS=Homo sapiens...,3,2,4,2,1377,154.8,18,2,...,2278582.0,2618469.75,2699438.25,2.219889e+06,2.519002e+06,2935427.5,3.243666e+06,2.416495e+06,3.896194e+06,ZMYM2
short-linkers,RYR2,Q92736,Ryanodine receptor 2 OS=Homo sapiens GN=RYR2 P...,0,2,2,2,4967,564.2,15,2,...,85275424.0,79028800.00,52530084.00,6.226795e+07,6.171508e+07,55644912.0,6.763379e+07,7.907600e+07,9.109008e+07,RYR2


Splitting numeric data and metadata

At this point it makes sense to split the data into numeric columns and the remaining metadata.
This makes more sense to do for each dataset separately

In [18]:
NUMERIC_COLUMNS = {
     'long-linkers-enh': [
       'H3unmod_50bp_repl1',
     'H3K27ac_50bp_repl1',
     'H3K27ac_200bp_scr_repl1',
     'H3K27ac_200bp_SV40enh_repl1',
     'H3K4me1K27ac_50bp_repl1',
     'H3K4me1K27ac_200bp_scr_repl1',
     'H3K4me1K27ac_200bp_SV40enh_repl1',
     'H3unmod_200bp_scr_repl1',
     'H3unmod_200bp_SV40enh_repl1',
     'H3K4me1_50bp_repl1',
     'H3K4me1_200bp_scr_repl1',
     'H3K4me1_200bp_SV40enh_repl1',
     'H3unmod_50bp_repl2',
     'H3K27ac_50bp_repl2',
     'H3K27ac_200bp_scr_repl2',
     'H3K27ac_200bp_SV40enh_repl2',
     'H3K4me1K27ac_50bp_repl2',
     'H3K4me1K27ac_200bp_scr_repl2',
     'H3K4me1K27ac_200bp_SV40enh_repl2',
     'H3unmod_200bp_scr_repl2',
     'H3unmod_200bp_SV40enh_repl2',
     'H3K4me1_50bp_repl2',
     'H3K4me1_200bp_scr_repl2',
     'H3K4me1_200bp_SV40enh_repl2',
     'H3unmod_50bp_repl3',
     'H3K27ac_50bp_repl3',
     'H3K27ac_200bp_scr_repl3',
     'H3K27ac_200bp_SV40enh_repl3',
     'H3K4me1K27ac_50bp_repl3',
     'H3K4me1K27ac_200bp_scr_repl3',
     'H3K4me1K27ac_200bp_SV40enh_repl3',
     'H3unmod_200bp_scr_repl3',
     'H3unmod_200bp_SV40enh_repl3',
     'H3K4me1_50bp_repl3',
     'H3K4me1_200bp_scr_repl3',
     'H3K4me1_200bp_SV40enh_repl3',
    ],
    'long-linkers-prom': [
         'Promoter_PTMs_50bp_repl1',
         'Promoter_PTMs_200bp_scr_repl1',
         'Promoter_PTMs_200bp_SV40prom_repl1',
         'unmodH3_unmodH4_50bp_repl1',
         'unmodH3_unmodH5_200bp_scr_repl1',
         'unmodH3_unmodH6_200bp_SV40prom_repl1',
         'Promoter_PTMs_50bp_repl2',
         'Promoter_PTMs_200bp_scr_repl2',
         'Promoter_PTMs_200bp_SV40prom_repl2',
         'unmodH3_unmodH4_50bp_repl2',
         'unmodH3_unmodH5_200bp_scr_repl2',
         'unmodH3_unmodH6_200bp_SV40prom_repl2',
         'Promoter_PTMs_50bp_repl3',
         'Promoter_PTMs_200bp_scr_repl3',
         'Promoter_PTMs_200bp_SV40prom_repl3',
         'unmodH3_unmodH4_50bp_repl3',
         'unmodH3_unmodH5_200bp_scr_repl3',
         'unmodH3_unmodH6_200bp_SV40prom_repl3',
    ],
    'short-linkers': [
       'H3K9me3_45bp__1', 'H3K9me3_50bp__1', 'H3K9me3_55bp__1',
       'H3K27me3_35bp__1', 'H3K27me3_40bp__1', 'H3K27me3_45bp__1',
       'H3K27me3_50bp__1', 'H3K27me3_55bp__1', 'H3unmod_35bp_1',
       'H3unmod_40bp_1', 'H3unmod_45bp_1', 'H3unmod_50bp__1', 'H3unmod_55bp_1',
       'H3K9me3_35bp_1', 'H3K9me3_40bp_1', 'H3K9me3_45bp__2',
       'H3K9me3_50bp__2', 'H3K9me3_55bp__2', 'H3K27me3_35bp__2',
       'H3K27me3_40bp__2', 'H3K27me3_45bp__2', 'H3K27me3_50bp__2',
       'H3K27me3_55bp__2', 'H3unmod_35bp_2', 'H3unmod_40bp_2',
       'H3unmod_45bp_2', 'H3unmod_50bp__2', 'H3unmod_55bp_2', 'H3K9me3_35bp_2',
       'H3K9me3_40bp_2', 'H3K9me3_45bp__3', 'H3K9me3_50bp__3',
       'H3K9me3_55bp__3', 'H3K27me3_35bp__3', 'H3K27me3_40bp__3',
       'H3K27me3_45bp__3', 'H3K27me3_50bp__3', 'H3K27me3_55bp__3',
       'H3unmod_35bp_3', 'H3unmod_40bp_3', 'H3unmod_45bp_3', 'H3unmod_50bp__3',
       'H3unmod_55bp_3', 'H3K9me3_35bp_3', 'H3K9me3_40bp_3'
    ],
}

In [19]:
RENAMES = {
    # Fix some typos in column names
    'long-linkers-prom': {
      
        'unmodH3_unmodH5_200bp_scr_repl1': 'unmodH3_unmodH4_200bp_scr_repl1',
        'unmodH3_unmodH6_200bp_SV40prom_repl1': 'unmodH3_unmodH4_200bp_SV40prom_repl1',
        'unmodH3_unmodH5_200bp_scr_repl2': 'unmodH3_unmodH4_200bp_scr_repl2',
        'unmodH3_unmodH6_200bp_SV40prom_repl2': 'unmodH3_unmodH4_200bp_SV40prom_repl2',
        'unmodH3_unmodH5_200bp_scr_repl3': 'unmodH3_unmodH4_200bp_scr_repl3',
        'unmodH3_unmodH6_200bp_SV40prom_repl3': 'unmodH3_unmodH4_200bp_SV40prom_repl3',
        
    }
}

In [20]:
data_numeric = {}
data_metadata = {}

for _dataset in INPUT_RAW_DATA:
    print(_dataset)
    subdf = data.loc[_dataset].dropna(axis=1, how='all')
    renames = RENAMES.get(_dataset, {})
    
    subdf_numeric = subdf[NUMERIC_COLUMNS[_dataset]].rename(columns=renames)
    subdf_metadata = subdf[subdf.columns.difference(subdf_numeric.columns)]
    
    data_numeric[_dataset] = subdf_numeric
    data_metadata[_dataset] = subdf_metadata
    

long-linkers-enh
long-linkers-prom
short-linkers


In [21]:
data_numeric['long-linkers-enh'].head()

Unnamed: 0_level_0,H3unmod_50bp_repl1,H3K27ac_50bp_repl1,H3K27ac_200bp_scr_repl1,H3K27ac_200bp_SV40enh_repl1,H3K4me1K27ac_50bp_repl1,H3K4me1K27ac_200bp_scr_repl1,H3K4me1K27ac_200bp_SV40enh_repl1,H3unmod_200bp_scr_repl1,H3unmod_200bp_SV40enh_repl1,H3K4me1_50bp_repl1,...,H3K27ac_200bp_scr_repl3,H3K27ac_200bp_SV40enh_repl3,H3K4me1K27ac_50bp_repl3,H3K4me1K27ac_200bp_scr_repl3,H3K4me1K27ac_200bp_SV40enh_repl3,H3unmod_200bp_scr_repl3,H3unmod_200bp_SV40enh_repl3,H3K4me1_50bp_repl3,H3K4me1_200bp_scr_repl3,H3K4me1_200bp_SV40enh_repl3
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MYH9,2012982000.0,2435018000.0,2687300000.0,2974732000.0,2815155000.0,2664790000.0,2206769000.0,2937641000.0,1541227000.0,2223824000.0,...,1625335000.0,1808056000.0,1905528000.0,2050247000.0,1285533000.0,2407647000.0,1993102000.0,1377992000.0,2111906000.0,2361874000.0
SPTAN1,719872000.0,961095200.0,1060747000.0,1057862000.0,1056938000.0,969096200.0,779775800.0,1022564000.0,709368500.0,910241000.0,...,655930000.0,778221400.0,811475700.0,811399500.0,524039700.0,973666200.0,806782300.0,703729700.0,821282600.0,936018900.0
PRKDC,718968000.0,802030500.0,1040543000.0,1030530000.0,835442900.0,879056700.0,730706700.0,862981000.0,835770100.0,1062908000.0,...,828498700.0,741144100.0,939763400.0,751969200.0,582989300.0,1002489000.0,761180200.0,604173500.0,833125700.0,843050500.0
ACTB,17113850000.0,20308840000.0,21432260000.0,19608200000.0,20606540000.0,19327970000.0,17434710000.0,21146680000.0,15321960000.0,21797640000.0,...,14254660000.0,18137110000.0,19333760000.0,18457580000.0,12225800000.0,21276570000.0,17723080000.0,17326590000.0,15048220000.0,20330820000.0
SPTBN1,744400700.0,1283232000.0,1334341000.0,1233183000.0,1224636000.0,1386340000.0,1237537000.0,793893700.0,585839400.0,882657300.0,...,1026505000.0,1317820000.0,1265981000.0,2981148000.0,964983900.0,962903500.0,802060000.0,626747200.0,4881266000.0,842454900.0


In [22]:
data_numeric['long-linkers-prom'].head()

Unnamed: 0_level_0,Promoter_PTMs_50bp_repl1,Promoter_PTMs_200bp_scr_repl1,Promoter_PTMs_200bp_SV40prom_repl1,unmodH3_unmodH4_50bp_repl1,unmodH3_unmodH4_200bp_scr_repl1,unmodH3_unmodH4_200bp_SV40prom_repl1,Promoter_PTMs_50bp_repl2,Promoter_PTMs_200bp_scr_repl2,Promoter_PTMs_200bp_SV40prom_repl2,unmodH3_unmodH4_50bp_repl2,unmodH3_unmodH4_200bp_scr_repl2,unmodH3_unmodH4_200bp_SV40prom_repl2,Promoter_PTMs_50bp_repl3,Promoter_PTMs_200bp_scr_repl3,Promoter_PTMs_200bp_SV40prom_repl3,unmodH3_unmodH4_50bp_repl3,unmodH3_unmodH4_200bp_scr_repl3,unmodH3_unmodH4_200bp_SV40prom_repl3
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
MYH9,1281909000.0,2134530000.0,2497327000.0,2281093000.0,2617519000.0,2035759000.0,2860328000.0,1788147000.0,1796727000.0,2444018000.0,3461079000.0,1853011000.0,2199156000.0,1279729000.0,2383846000.0,2562314000.0,2035037000.0,1776609000.0
SPTAN1,524891100.0,658741100.0,835738200.0,858530900.0,880925100.0,744848900.0,991336800.0,543429800.0,553509300.0,921034100.0,1152325000.0,727173800.0,813484500.0,648477900.0,692914400.0,903471900.0,842798800.0,792537800.0
PRKDC,727043000.0,550639900.0,716430300.0,809648100.0,880042700.0,615931500.0,654561100.0,377837100.0,405702900.0,801876700.0,822381400.0,493314400.0,585910800.0,1037794000.0,698567600.0,675647200.0,607981000.0,618179500.0
SPTBN1,721178700.0,1222698000.0,1586194000.0,839236000.0,984979600.0,523645200.0,1537787000.0,1079691000.0,1061178000.0,748477200.0,1188865000.0,549636400.0,1151616000.0,1037424000.0,1468142000.0,762687400.0,786591700.0,827201600.0
ACTB,12696550000.0,17200740000.0,19228480000.0,18891110000.0,20858010000.0,17926950000.0,22134780000.0,13237390000.0,12559520000.0,22009360000.0,25678430000.0,14875290000.0,19534460000.0,14169610000.0,17650190000.0,21132110000.0,19219240000.0,18205000000.0


In [23]:
data_numeric['short-linkers'].head()

Unnamed: 0_level_0,H3K9me3_45bp__1,H3K9me3_50bp__1,H3K9me3_55bp__1,H3K27me3_35bp__1,H3K27me3_40bp__1,H3K27me3_45bp__1,H3K27me3_50bp__1,H3K27me3_55bp__1,H3unmod_35bp_1,H3unmod_40bp_1,...,H3K27me3_45bp__3,H3K27me3_50bp__3,H3K27me3_55bp__3,H3unmod_35bp_3,H3unmod_40bp_3,H3unmod_45bp_3,H3unmod_50bp__3,H3unmod_55bp_3,H3K9me3_35bp_3,H3K9me3_40bp_3
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MYH9,6819964000.0,3963794000.0,2897327000.0,2612789000.0,3547799000.0,5158553000.0,4680400000.0,3737984000.0,5536170000.0,4129462000.0,...,4917671000.0,3194571000.0,2376342000.0,3601253000.0,3044944000.0,2370098000.0,2235995000.0,2117592000.0,2953362000.0,3526135000.0
CONT_P22629,167860800000.0,99681950000.0,80499410000.0,84287460000.0,116885900000.0,135194200000.0,120474500000.0,108332600000.0,165219400000.0,171815600000.0,...,154311800000.0,163570900000.0,149096100000.0,125069800000.0,131570500000.0,147674300000.0,150122300000.0,143406500000.0,110287400000.0,173428300000.0
SPTAN1 (1),411537100.0,304932900.0,301096800.0,239264200.0,325926500.0,297919900.0,354183200.0,317595700.0,377629300.0,436802800.0,...,315458000.0,274318500.0,213617200.0,243176100.0,297563300.0,208509700.0,221492600.0,216614300.0,275257500.0,290491600.0
SPTAN1 (2),96864310.0,69740940.0,105699500.0,21271340.0,30497790.0,36688130.0,27166160.0,97953200.0,37727290.0,28148810.0,...,63293830.0,13682550.0,30014790.0,21262390.0,12736560.0,39965280.0,36437550.0,27816900.0,21269480.0,27142640.0
SPTBN1,842317000.0,427336900.0,377959000.0,311699400.0,430368400.0,563914000.0,523953600.0,435457000.0,686263400.0,574496600.0,...,572594200.0,360493400.0,289056400.0,409911300.0,376759600.0,309650400.0,313065300.0,289716000.0,352308000.0,448546900.0


In [24]:
data_metadata['long-linkers-enh'].head()

Unnamed: 0_level_0,# AAs,# PSMs,# Peptides,# Peptides (by Search Engine): Mascot,# Unique Peptides,Accession,Coverage [%],Description,Gene,MW [kDa],Score Mascot: Mascot
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
MYH9,1960,20013,170,170,137,P35579,71,Myosin-9 OS=Homo sapiens OX=9606 GN=MYH9 PE=1 ...,MYH9,226.4,639499
SPTAN1,2472,11500,205,205,204,Q13813,79,"Spectrin alpha chain, non-erythrocytic 1 OS=Ho...",SPTAN1,284.4,391743
PRKDC,4128,12445,246,246,246,P78527,64,DNA-dependent protein kinase catalytic subunit...,PRKDC,468.8,377897
ACTB,375,10826,30,30,13,P60709,82,"Actin, cytoplasmic 1 OS=Homo sapiens OX=9606 G...",ACTB,41.7,369811
SPTBN1,2364,10348,180,180,166,Q01082,79,"Spectrin beta chain, non-erythrocytic 1 OS=Hom...",SPTBN1,274.4,350270


In [25]:
data_metadata['long-linkers-prom'].head()

Unnamed: 0_level_0,# AAs,# PSMs,# Peptides,# Peptides (by Search Engine): Mascot,# Unique Peptides,Accession,Coverage [%],Description,Gene,MW [kDa],Score Mascot: Mascot,unmodH3_unmodH5_200bp_scr_repl1,unmodH3_unmodH5_200bp_scr_repl2,unmodH3_unmodH5_200bp_scr_repl3,unmodH3_unmodH6_200bp_SV40prom_repl1,unmodH3_unmodH6_200bp_SV40prom_repl2,unmodH3_unmodH6_200bp_SV40prom_repl3
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
MYH9,1960,9223,158,158,137,P35579,69,Myosin-9 OS=Homo sapiens OX=9606 GN=MYH9 PE=1 ...,MYH9,226.4,297789,2617519000.0,3461079000.0,2035037000.0,2035759000.0,1853011000.0,1776609000.0
SPTAN1,2472,5575,203,203,201,Q13813,77,"Spectrin alpha chain, non-erythrocytic 1 OS=Ho...",SPTAN1,284.4,189967,880925100.0,1152325000.0,842798800.0,744848900.0,727173800.0,792537800.0
PRKDC,4128,5822,228,228,228,P78527,55,DNA-dependent protein kinase catalytic subunit...,PRKDC,468.8,175811,880042700.0,822381400.0,607981000.0,615931500.0,493314400.0,618179500.0
SPTBN1,2364,4865,165,165,152,Q01082,75,"Spectrin beta chain, non-erythrocytic 1 OS=Hom...",SPTBN1,274.4,167574,984979600.0,1188865000.0,786591700.0,523645200.0,549636400.0,827201600.0
ACTB,375,4951,29,29,11,P60709,82,"Actin, cytoplasmic 1 OS=Homo sapiens OX=9606 G...",ACTB,41.7,164631,20858010000.0,25678430000.0,19219240000.0,17926950000.0,14875290000.0,18205000000.0


In [26]:
data_metadata['short-linkers'].head()

Unnamed: 0_level_0,# AAs,# PSMs,# Peptides,# Peptides (by Search Engine): Mascot,# Unique Peptides,Accession,Coverage [%],Description,Gene,MW [kDa],Score Mascot: Mascot,calc. pI
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
MYH9,1960,35190,178,178,142,P35579,71,Myosin-9 OS=Homo sapiens GN=MYH9 PE=1 SV=4,MYH9,226.4,1244592,5.6
CONT_P22629,183,13859,9,9,9,CONT_P22629,43,Streptavidin OS=Streptomyces avidinii PE=1 SV=1,,18.8,639027,8.35
SPTAN1 (1),2477,14713,177,177,4,Q13813-2,74,"Isoform 2 of Spectrin alpha chain, non-erythro...",SPTAN1,284.9,543452,5.35
SPTAN1 (2),2452,14454,175,175,2,Q13813-3,74,"Isoform 3 of Spectrin alpha chain, non-erythro...",SPTAN1,282.1,534079,5.34
SPTBN1,2364,12324,148,148,16,Q01082,73,"Spectrin beta chain, non-erythrocytic 1 OS=Hom...",SPTBN1,274.4,455043,5.57


And this is pretty much everything that we needed to do in this notebook, so let's just save the outputs

In [27]:
for k, df in data_numeric.items():
    df.to_csv(OUTPUT_DIRECTORY / f'data_numeric.{k}.csv')
    
for k, df in data_metadata.items():
    df.to_csv(OUTPUT_DIRECTORY / f'data_metadata.{k}.csv')