# Prepare metabolomics data for ipath ingestion
This notebook is dedicated to preparing the VBCF metabolomics facility results for upload to ipath. However, since the data is provided as excel file containing several sheets one needs to do some preprocessing in excel in order to make the data parsable. In particular these preprocessing/reformating steps apply only to the 'Results HILIC' and 'Results RP' sheets since these are the main data sources.

1. Reformat the log2 fold change cells to contain numbers with at least 3 decimal places
2. Reformat the normalized peak area cells to contain numbers with at least 10 decimal places
3. Save each sheet as UTF-8 decoded CSV file

The rest will be done by `parse_metabolomics_results` function which will return two `pandas.DataFrame` objects containing all the data you need (one containing everything that has a KEGG ID and one containing everything without a KEGG ID)

In [1]:
import pandas as pd
import csv
import requests

def get_unambiguous_column_names(column_names):
    series = pd.Series(column_names)
    series[series.duplicated(keep = 'first')] = series[series.duplicated(keep = 'first')].apply(
        lambda x: x + '.1'
    )
    return series.to_list()

def get_ipath_selection(df, identifier, color, width, print_selection = False):
    selection = []
    for i in df[identifier]:
        entry = [i, color, f'W{str(width)}']
        selection.append(entry)
        
        if print_selection:
            print(' '.join(entry))
            
    return selection

def parse_metabolomics_results(filename, delimiter = ',', quotechar = '"'):
    with open(filename) as csvfile:
        data = []
        entryid = None
        csvreader = csv.reader(
            csvfile,
            delimiter = delimiter,
            quotechar = quotechar
        )
        
        # get column names and make them unambiguous
        names = []
        for i in range(2):
            names = csvreader.__next__()
        
        names = get_unambiguous_column_names(names)
        
        add_names = [
                'Name2', 
                'Molecular Weight', 
                'RT [min]2', 
                'DeltaMass [ppm]', 
                'DBID', 
                'Reference List Name', 
                'mzLogic Score', 
                'ChemSpider ID', 
                'KEGG', 'HMDB', 
                'Mass List Search Results ID'
        ]
        add_names_set = set(add_names) # faster check
        series = None
        tmp_names = []
        for line in csvreader:
            if line[0]:
                entryid = line[0]
                series = pd.Series(
                    {k: v for k, v in zip(names, line)}
                )

            elif line[1] == 'Name':
                tmp_names = [name for name in line if name]
            
            elif line[1] == series.Name:
                for k, v in zip(
                    tmp_names,
                    line[1: 1 + len(tmp_names)]
                ):
                    if k in add_names_set:
                        k = k + '2' if k in {'Name', 'RT [min]'} else k
                        series[k] = v if v else None
                    
                    elif k == 'KEGG ID':
                        series['KEGG'] = v if v else None
                    
                data.append(series.copy())
    
    return pd.DataFrame(data, columns = names + add_names)

def clean_dataframe(df):
    kegg_rows = df.KEGG.isna()
    no_kegg = df.loc[kegg_rows, :].copy()
    df = df.loc[~kegg_rows, :].copy()
    df = df.drop_duplicates(subset = ['ID', 'KEGG'])
    return df, no_kegg

In [2]:
hilic = parse_metabolomics_results('../raw/Results_Untargeted_Metabolomics_E14-P40_LK_Nova_results_HILIC.csv')
print(hilic.ID.unique().shape)
hilic, no_kegg_hilic = clean_dataframe(hilic)
hilic

(372,)


Unnamed: 0,ID,Name,Calc. MW,RT [min],"(P40, KO) / (P40, WT)","(P2, KO) / (P2, WT)","(E14_5, KO) / (E14_5, WT)","(P40, WT) / (P2, WT)","(P40, WT) / (E14_5, WT)","(P2, WT) / (E14_5, WT)",...,Molecular Weight,RT [min]2,DeltaMass [ppm],DBID,Reference List Name,mzLogic Score,ChemSpider ID,KEGG,HMDB,Mass List Search Results ID
0,A001_HILIC,Pyruvic acid,88.0160,3.93,-1.32,-0.31,-0.55,0.03,-0.27,-0.30,...,88.0160,,-0.11,B003,iHILIC_neg_2020,68.35,1031,C00022,HMDB0000243,101
1,A002_HILIC,L-(+)-Alanine,89.0477,14.83,0.00,-0.05,-0.04,0.23,-0.34,-0.57,...,89.0477,,0.34,A072,iHILIC_neg_2020,74.93,5735,C00041,HMDB0000161,57
3,A003_HILIC,Sarcosine,89.0477,13.63,0.31,0.04,0.22,-4.07,-4.17,-0.09,...,89.0477,,0.71,A026,iHILIC_pos_2020,36.31,1057,C00213,HMDB0000271,26
5,A004_HILIC,L-(+)-Lactic acid,90.0317,4.48,1.08,0.21,-0.22,-0.28,-0.78,-0.49,...,90.0317,,-0.10,B002,iHILIC_neg_2020,93.20,96860,C00186,HMDB0000190,109
6,A005_HILIC,Glycerin,92.0474,7.10,0.21,-0.06,-0.20,1.64,1.58,-0.06,...,92.0473,,0.17,D034,iHILIC_neg_2020,,733,C00116,HMDB0000131,473
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,B180_HILIC,"α,α-Trehalose",342.1162,15.63,1.90,0.60,0.07,-0.95,-3.74,-2.79,...,342.1162,,-0.19,,,,,C01083,,
237,B183_HILIC,Uridine 5'-diphosphate (UDP),404.0022,18.51,-0.03,-0.04,-0.03,-2.59,-5.55,-2.96,...,404.0022,,0.03,,,,,C00015,,
238,B184_HILIC,Adenosine diphosphate (ADP),427.0296,17.32,-0.17,-0.19,-0.04,-0.58,-2.30,-1.72,...,427.0294,,0.31,,,,,C00008,,
240,B186_HILIC,Uridine 5'-diphosphoglucuronic acid,580.0345,20.93,0.12,-0.19,0.01,-1.59,-2.12,-0.54,...,580.0343,,0.28,,,,,C00167,,


In [3]:
no_kegg_hilic

Unnamed: 0,ID,Name,Calc. MW,RT [min],"(P40, KO) / (P40, WT)","(P2, KO) / (P2, WT)","(E14_5, KO) / (E14_5, WT)","(P40, WT) / (P2, WT)","(P40, WT) / (E14_5, WT)","(P2, WT) / (E14_5, WT)",...,Molecular Weight,RT [min]2,DeltaMass [ppm],DBID,Reference List Name,mzLogic Score,ChemSpider ID,KEGG,HMDB,Mass List Search Results ID
50,A033_HILIC,6-Oxo-pipecolinic acid,143.0583,3.85,0.16,0.77,-0.19,-1.59,-4.09,-2.50,...,143.0582,,0.48,A111,iHILIC_neg_2020,39.46,2282737,,HMDB0061705,578
180,B123_HILIC,2-Hydroxycaproic acid,132.0786,2.50,0.50,-0.25,-0.25,-1.65,-1.87,-0.21,...,132.0786,,-0.42,,,,,,,
183,B127_HILIC,6-Aminonicotinic acid,138.0430,6.88,-0.68,-0.42,-0.42,0.48,-0.62,-1.10,...,138.0429,,0.74,,,,,,,
186,B130_HILIC,DL-Stachydrine,143.0947,9.20,0.18,-0.41,-0.31,1.01,-3.60,-4.61,...,143.0946,,0.46,,,,,,,
194,B139_HILIC,1-Methylguanine,165.0652,11.62,0.32,0.24,0.39,0.25,1.05,0.80,...,165.0651,,0.72,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,c392_HILIC,PS(16:1(9Z)/18:1(9Z)),759.5050,2.13,0.63,0.14,-0.18,-0.81,-3.47,-2.66,...,759.5050,,0.01,,,,,,,
444,c393_HILIC,"PS(18:2(9Z,12Z)/18:2(9Z,12Z))",783.5051,2.12,0.78,0.10,-0.09,-2.19,-3.06,-0.87,...,783.5050,,0.02,,,,,,,
445,c394_HILIC,Phosphatidylserine,791.5676,1.98,0.95,0.84,-0.63,1.50,-1.48,-2.98,...,791.5676,,-0.09,,,,,,,
446,c395_HILIC,1-oleoyl-2-arachidonoyl-sn-glycero-3-phospho-L...,809.5200,2.11,0.80,0.06,-0.06,-0.69,-1.70,-1.02,...,809.5207,,-0.86,,,,,,,


In [4]:
import itertools as it
conditions = ['WT', 'KO']
timepoints = ['E14_5', 'P2', 'P40']
def write_raw_values(df, timepoints, conditions, prefix):
    for sample in it.product(timepoints, conditions):
        sample_cols = hilic.columns[
            df.columns.str.match(
                '^S[0-9]{2}_' + '_'.join(sample)
            )
        ].to_list()

        df.loc[:, ['KEGG'] + sample_cols].to_csv(
            '_'.join([prefix, *sample]) + '.tsv',
            sep = '\t',
            header = False,
            index = False
        )

columns_wt = ['(P2, WT) / (E14_5, WT)', '(P40, WT) / (P2, WT)']
columns_ko = ['(P2, KO) / (E14_5, KO)', '(P40, KO) / (P2, KO)']
def write_foldchanges(df, columns, condition, prefix):
    # writing base values for t0
    tmp = pd.DataFrame(
        get_ipath_selection(
            hilic, 
            'KEGG', 
            '#000000', 
            20
        )
    )
    tmp.to_csv(
        '_'.join([prefix, 'base', condition]) + '.tsv',
        sep = '\t',
        header = False,
        index = False
    )
    
    for column in columns:
        suffix = column[1:-1] \
            .replace(', ', '_') \
            .replace(') / (', '_')
        
        df.loc[:, ['KEGG', column]].to_csv(
            '_'.join([prefix, suffix]) + '.tsv',
            sep = '\t',
            header = False,
            index = False
        )
    
# write_raw_values(
#     hilic,
#     timepoints,
#     conditions,
#     '../raw/metabolomics_hilic'
# )
for condition, columns in zip(
    conditions, 
    [columns_wt, columns_ko]
):
    write_foldchanges(
        hilic,
        columns,
        condition,
        '../raw/metabolomics_hilic'
    )
    
selection = get_ipath_selection(
    hilic, 
    'KEGG', 
    '#000000', 
    20,
    print_selection = True
)

C00022 #000000 W20
C00041 #000000 W20
C00213 #000000 W20
C00186 #000000 W20
C00116 #000000 W20
C01026 #000000 W20
C00334 #000000 W20
C01089 #000000 W20
C00065 #000000 W20
C00519 #000000 W20
C00106 #000000 W20
C00791 #000000 W20
C00148 #000000 W20
C00581 #000000 W20
C00719 #000000 W20
C00188 #000000 W20
C00263 #000000 W20
C07182 #000000 W20
C00153 #000000 W20
C00245 #000000 W20
C01879 #000000 W20
C00408 #000000 W20
C00233 #000000 W20
C02714 #000000 W20
C01157 #000000 W20
C00123 #000000 W20
C00407 #000000 W20
C00152 #000000 W20
C00077 #000000 W20
C00049 #000000 W20
C00262 #000000 W20
C00064 #000000 W20
C00047 #000000 W20
C00025 #000000 W20
C00979 #000000 W20
C00073 #000000 W20
C00242 #000000 W20
C00385 #000000 W20
C00295 #000000 W20
C01551 #000000 W20
C00956 #000000 W20
C00318 #000000 W20
C02989 #000000 W20
C00079 #000000 W20
C00074 #000000 W20
C00366 #000000 W20
C01152 #000000 W20
C00111 #000000 W20
C00093 #000000 W20
C03771 #000000 W20
C00062 #000000 W20
C00327 #000000 W20
C00072 #0000

In [5]:
rp = parse_metabolomics_results('../raw/Results_Untargeted_Metabolomics_E14-P40_LK_Nova_results_RP.csv')
print(rp.ID.unique().shape)
rp, no_kegg_rp = clean_dataframe(rp)
rp

(206,)


Unnamed: 0,ID,Name,Calc. MW,RT [min],"(P40, KO) / (P40, WT)","(P2, KO) / (P2, WT)","(E14_5, KO) / (E14_5, WT)","(P40, WT) / (P2, WT)","(P40, WT) / (E14_5, WT)","(P2, WT) / (E14_5, WT)",...,Molecular Weight,RT [min]2,DeltaMass [ppm],DBID,Reference List Name,mzLogic Score,ChemSpider ID,KEGG,HMDB,Mass List Search Results ID
0,A001_RP,L-(+)-Alanine,89.0478,3.86,0.10,0.14,0.21,0.15,-1.90,-2.05,...,89.0477,,1.09,A072,RP_pos_2020,93.43,5735,C00041,HMDB0000161,71
1,A002_RP,L-a-Amino-n-butyric acid,103.0633,4.05,0.48,0.46,-0.18,1.73,3.06,1.33,...,103.0633,,-0.17,E19,RP_pos_2020,89.90,72524,C02356,HMDB0000452,176
2,A003_RP,Choline,103.0997,3.82,0.08,0.27,0.06,0.55,0.17,-0.38,...,103.0997,,-0.15,B018,RP_pos_2020,,299,C00114,HMDB0000097,72
3,A004_RP,L-Serine,105.0426,3.74,-0.08,0.50,0.18,0.58,0.05,-0.53,...,105.0426,,0.35,A075,RP_neg_2020,91.03,5736,C00065,HMDB0000187,19
4,A005_RP,Hypotaurine,109.0198,3.86,0.18,0.48,0.39,-2.54,-4.31,-1.77,...,109.0198,,0.68,B019,RP_pos_2020,70.96,96959,C00519,HMDB0000965,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146,B103_RP,Uridine monophosphate (UMP),324.0360,5.45,-0.13,0.17,-0.01,-1.49,0.01,1.50,...,324.0359,,0.35,,,,,C00105,,
147,B104_RP,3'-Adenosine monophosphate (3'-AMP),347.0631,6.04,-0.07,0.09,0.05,0.75,1.74,0.99,...,347.0631,,0.10,,,,,C01367,,
148,B105_RP,4-Hydroxytamoxifen,387.2198,26.07,0.07,1.94,-0.01,-5.14,-0.56,4.58,...,387.2198,,-0.09,,,,,D06551,,
151,B107_RP,Adenosine diphosphate (ADP),427.0298,4.74,-0.11,-0.16,-0.07,-0.75,-3.17,-2.42,...,427.0294,,0.78,,,,,C00008,,


In [6]:
import requests
def map_compounds_to_ec(compounds):
    response = requests.get(
        'http://rest.kegg.jp/link/enzyme/' + 
        '+'.join(compounds)
    )
    mapping = []
    for string in response.text.split('\n'):
        if string:
            mapping.append(
                [s.split(':')[1] for s in string.split('\t')]
            )
    
    df = pd.DataFrame(
        mapping,
        columns = ['KEGG', 'EC']
    )
    return df

kegg2ec = map_compounds_to_ec(
    hilic.KEGG.to_list()
)
kegg2ec = kegg2ec.merge(
    hilic.loc[:, ['Name', 'KEGG']],
    on = 'KEGG',
    how = 'left'
)
kegg2ec

Unnamed: 0,KEGG,EC,Name
0,C00022,1.2.1.49,Pyruvic acid
1,C00022,1.2.1.51,Pyruvic acid
2,C00022,1.2.3.15,Pyruvic acid
3,C00022,1.2.3.3,Pyruvic acid
4,C00022,1.2.3.6,Pyruvic acid
...,...,...,...
2980,C00002,6.3.2.50,Adenosine 5'-triphosphate (5'-ATP)
2981,C00002,6.3.2.51,Adenosine 5'-triphosphate (5'-ATP)
2982,C00002,6.3.2.26,Adenosine 5'-triphosphate (5'-ATP)
2983,C05122,2.3.1.65,Taurocholic acid


In [7]:
compass = pd.read_csv(
    '../processed/compass_reactions.tsv',
    sep = '\t',
    index_col = 0
)
compass

Unnamed: 0,log2FC,metadata_r_id,ec_number
13DAMPPOX_pos,1.016497,13DAMPPOX,1.4.3.6
2DR1PP_pos,1.011031,2DR1PP,3.1.3.10
2HBO_neg,1.009523,2HBO,1.1.1.27
2HBO_pos,0.999164,2HBO,1.1.1.27
2OXOADOXm_pos,0.997916,2OXOADOXm,2.3.1.61
...,...,...,...
r1487_pos,1.030900,r1487,6.2.1.3
r1488_pos,1.017051,r1488,6.2.1.3
r1492_pos,1.030697,r1492,6.2.1.3
sink_citr(c)_pos,1.019971,sink_citr(c),2.4.99.9


In [8]:
kegg2ec[kegg2ec.EC == '6.2.1.3']

Unnamed: 0,KEGG,EC,Name
1559,C00020,6.2.1.3,Adenosine 5'-monophosphate (5'-AMP)
2763,C00002,6.2.1.3,Adenosine 5'-triphosphate (5'-ATP)


In [29]:
import pickle
def parse_gene_record(gene_record):
    # [:-1] to get rid of the trailing bracket
    try:
        idsym, namekeggec = gene_record[:-1].split(
            '; ',
            maxsplit = 1
        )
        geneid, symbol = idsym.split()
    
    except ValueError:
        idsym, namekeggec = gene_record[:-1].split(maxsplit = 1)
        geneid, symbol = idsym, None
        
    record = {
        'ncbigeneid': geneid,
        'genesymbol': symbol,
    }
    
    try:
        name, keggec = namekeggec.split(
            ' [',
            maxsplit = 1
        )
        for key, val in [s.split(':') for s in keggec.split('] [')]:
            record[key] = val
    
    except ValueError:
        name = namekeggec.strip()
    
    record['name'] = name
        
    return geneid, record


def parse_kegg_pathway(responsetxt):
    keywords = {
        'ENTRY',
        'NAME',
        'DESCRIPTION',
        'CLASS',
        'MODULE',
        'GENE',
        'COMPOUND'
    }
    
    record_parsers = {
        'COMPOUND': lambda x: x.split(maxsplit = 1) if len(x.split(maxsplit = 1)) > 1 else (x.strip(), None),
        'GENE': parse_gene_record
    }
    
    current_key, entries = None, None
    parse_results = {}
    for line in responsetxt.split('\n'):
        if line:
            key = line[:12].strip()
            
        else:
            continue
            
        if key:
            if current_key:
                parse_results[current_key] = entries
                
            current_key = key
            entry = line.split(maxsplit = 1)
            
            if current_key in record_parsers.keys():
                key, val = record_parsers[current_key](entry[1])
                entries = {
                    key: val
                }
            
            else:
                entries = [
                    entry[1] if len(entry) > 1 else ''
                ]
        
        else:
            if current_key in record_parsers.keys():
                key, val = record_parsers[current_key](line.strip()) 
                entries[key] = val
            
            else:
                entries.append(
                    line.strip()
                )
     
    # also save the last bits
    parse_results[current_key] = entries  
    
    # remove anything that is not in keywords
    keys = set(parse_results.keys())
    for key in keys.difference(keywords):
        parse_results.pop(key)
    
    return parse_results


def get_kegg_pathway_maps(taxid = 'mmu'):
    r = requests.get(
        f'http://rest.kegg.jp/list/pathway/{taxid}'
    )
    response_lines = [line.split('\t') for line in r.text.split('\n') if line]
    pathway_map_ids = {
        key.split(':')[1]: val for key, val in response_lines
    }
    
    pathway_maps = {}
    for mapid in pathway_map_ids.keys():
        r = requests.get(
            f'http://rest.kegg.jp/get/{mapid}'
        )
        pathway_maps[mapid] = parse_kegg_pathway(r.text)
        
    return pathway_maps


pathwaymaps = get_kegg_pathway_maps()
with open('../raw/kegg_pathway_maps.pickle', 'wb') as handle:
    pickle.dump(
        pathwaymaps,
        handle
    )

In [9]:
import pickle
with open('../raw/kegg_pathway_maps.pickle', 'rb') as handle:
    pathwaymaps = pickle.load(handle)

In [10]:
def select_maps(pathwaydict, class_contains):
    selection = {}
    for k, v in pathwaydict.items():
        if 'CLASS' in v and any([pattern in v['CLASS'][0].lower() for pattern in class_contains]):
            selection[k] = v
            
    return selection

selected_pathways = select_maps(
    pathwaymaps,
    ['metabolism', 'information processing']
)

In [56]:
def compute_overlaps(diff_compounds, pathways, metabolites = None):
    diff_compounds = set(diff_compounds)
    overlaps = []
    for key, pathway in pathways.items():
        if 'COMPOUND' in pathway:
            pathway_metabolites = set(pathway['COMPOUND'].keys())
            if metabolites:
                pathway_metabolites = pathway_metabolites.intersection(
                    set(metabolites)
                )
            overlaps.append(
                [
                    key, 
                    pathway['NAME'][0],
                    len(
                        diff_compounds.intersection(
                            pathway_metabolites
                        )
                    ),
                    len(pathway_metabolites),
                    list(pathway_metabolites)
                ]
            )
    
    return pd.DataFrame(overlaps, columns = ['mapid', 'name', 'ncommon', 'ncompounds', 'compounds'])

pvalcol = '(P40, WT) / (P2, WT).1'
def get_diff_reg_compounds(df, pvalcol, logfoldcol, alpha = 0.05):
    upregulated = df.loc[
        (df[pvalcol].astype(float) <= alpha) & (df[logfoldcol].astype(float) > 0),
        ['KEGG', 'Name', pvalcol, logfoldcol]
    ]
    downregulated = df.loc[
        (df[pvalcol].astype(float) <= alpha) & (df[logfoldcol].astype(float) < 0),
        ['KEGG', 'Name', pvalcol, logfoldcol]
    ]
    return upregulated, downregulated

upreg, downreg = get_diff_reg_compounds(
    hilic,
    pvalcol,
    pvalcol[:-2]
)
overlaps = compute_overlaps(upreg.KEGG.to_list(), selected_pathways, hilic.KEGG.to_list())
overlaps

Unnamed: 0,mapid,name,ncommon,ncompounds,compounds
0,mmu00010,Glycolysis / Gluconeogenesis - Mus musculus (m...,0,5,"[C00022, C00186, C00111, C00074, C00103]"
1,mmu00020,Citrate cycle (TCA cycle) - Mus musculus (mouse),0,4,"[C00158, C00074, C00022, C00417]"
2,mmu00030,Pentose phosphate pathway - Mus musculus (mouse),2,6,"[C00620, C00345, C00022, C05382, C00257, C00199]"
3,mmu00040,Pentose and glucuronate interconversions - Mus...,2,8,"[C00029, C00167, C00116, C00022, C00111, C0053..."
4,mmu00051,Fructose and mannose metabolism - Mus musculus...,1,3,"[C00096, C00111, C00186]"
...,...,...,...,...,...
102,mmu04310,Wnt signaling pathway - Mus musculus (mouse),0,0,[]
103,mmu04340,Hedgehog signaling pathway - Mus musculus (mouse),0,0,[]
104,mmu04370,VEGF signaling pathway - Mus musculus (mouse),0,0,[]
105,mmu04371,Apelin signaling pathway - Mus musculus (mouse),0,0,[]


In [69]:
from scipy.stats import hypergeom
import numpy as np
def pvalue(k, M, n, N):
    """
    computes the probability to find k or more overlapping genes between two gene sets
    of n and N genes, where n is the number of genes in the gene set to which we overlap
    (i.e. number of genes in a pathway or number of differentially expressed genes in
    another patient) and N is the number of genes in the gene set which we are interested
    in (i.e. number of differentially expressed genes) which are drawn from M total genes
    (i.e. number of genes considered during differential expression analysis). This is also
    known as computing the pvalue for fisher's exact test.
    See http://www.pathwaycommons.org/guide/primers/statistics/fishers_exact_test/ for
    more information on how this is computed
    :param k:   number of common genes between two gene sets
    :param M:   total number of genes considered during DEA
    :param n:   number of genes in gene set with which we overlap
    :param N:   number of differentially expressed genes
    :return:    probability to find k or more common genes between the two gene sets
    """
    # k - 1 because we are computing P(x >= k) which includes k
    pval = hypergeom.sf(
        k - 1, M, n, N
    )
    return pval

def fdrcorrection(df, fdr = 0.05):
    # benjamini-hochberg correction
    # taken from statsmodels.stats.multitest.fdrcorrection
    # https://github.com/statsmodels/statsmodels/blob/main/statsmodels/stats/multitest.py
    df.sort_values(
        by = 'pvalue',
        inplace = True
    )
    df['padj'] = np.arange(1, len(df) + 1) / len(df) * fdr
    reject = (df.pvalue <= df.padj).values
    if reject.any():
        rejectmax = np.nonzero(reject)[0][-1]
        reject[:rejectmax] = True
    
    df['reject'] = reject
    
    return df

In [67]:
M = 396
overlaps['pvalue'] = overlaps.apply(
    lambda x: pvalue(x.ncommon, M, x.ncompounds, len(upreg)),
    axis = 1
)
overlaps = fdrcorrection(overlaps)
overlaps

Unnamed: 0,mapid,name,ncommon,ncompounds,compounds,pvalue,padj,reject
19,mmu00250,"Alanine, aspartate and glutamate metabolism - ...",5,11,"[C00025, C12270, C00022, C03406, C00041, C0015...",0.001275,0.000467,False
16,mmu00230,Purine metabolism - Mus musculus (mouse),6,16,"[C00620, C00130, C00008, C00387, C00242, C0038...",0.001302,0.000935,False
82,mmu02010,ABC transporters - Mus musculus (mouse),8,29,"[C00025, C00245, C00148, C00255, C00378, C0011...",0.001855,0.001402,False
31,mmu00410,beta-Alanine metabolism - Mus musculus (mouse),3,4,"[C00334, C00386, C00049, C00106]",0.002393,0.001869,False
50,mmu00564,Glycerophospholipid metabolism - Mus musculus ...,4,8,"[C00065, C00111, C00670, C00570, C00346, C0011...",0.002797,0.002336,False
...,...,...,...,...,...,...,...,...
44,mmu00531,Glycosaminoglycan degradation - Mus musculus (...,0,0,[],1.000000,0.048131,False
41,mmu00515,Mannose type O-glycan biosynthesis - Mus muscu...,0,1,[C00063],1.000000,0.048598,False
40,mmu00513,Various types of N-glycan biosynthesis - Mus m...,0,0,[],1.000000,0.049065,False
23,mmu00290,"Valine, leucine and isoleucine biosynthesis - ...",0,5,"[C00123, C00022, C00407, C00188, C00233]",1.000000,0.049533,False


In [59]:
overlaps[overlaps.pvalue <= 0.05]

Unnamed: 0,mapid,name,ncommon,ncompounds,compounds,pvalue,padj,reject
19,mmu00250,"Alanine, aspartate and glutamate metabolism - ...",5,11,"[C00025, C12270, C00022, C03406, C00041, C0015...",0.001275,0.000467,False
16,mmu00230,Purine metabolism - Mus musculus (mouse),6,16,"[C00620, C00130, C00008, C00387, C00242, C0038...",0.001302,0.000935,False
82,mmu02010,ABC transporters - Mus musculus (mouse),8,29,"[C00025, C00245, C00148, C00255, C00378, C0011...",0.001855,0.001402,False
31,mmu00410,beta-Alanine metabolism - Mus musculus (mouse),3,4,"[C00334, C00386, C00049, C00106]",0.002393,0.001869,False
50,mmu00564,Glycerophospholipid metabolism - Mus musculus ...,4,8,"[C00065, C00111, C00670, C00570, C00346, C0011...",0.002797,0.002336,False
96,mmu04080,Neuroactive ligand-receptor interaction - Mus ...,4,9,"[C12270, C00025, C00245, C00015, C00212, C0033...",0.004718,0.002804,False
75,mmu00910,Nitrogen metabolism - Mus musculus (mouse),2,2,"[C00025, C00064]",0.007608,0.003271,False
26,mmu00340,Histidine metabolism - Mus musculus (mouse),3,6,"[C00025, C05828, C02835, C01152, C00386, C00049]",0.010554,0.003738,False
36,mmu00480,Glutathione metabolism - Mus musculus (mouse),3,7,"[C00025, C00005, C00077, C00127, C00051, C0007...",0.017349,0.004206,False
15,mmu00220,Arginine biosynthesis - Mus musculus (mouse),3,7,"[C00025, C00077, C00327, C00062, C03406, C0006...",0.017349,0.004673,False


In [73]:
def compute_selections(df, pathways, keggidcol, foldchange_columns, pvalue_columns, alpha = 0.05, fdr = 0.1):
    selections = {}
    for pvalcol, fccol in zip(pvalue_columns, foldchange_columns):
        selection = pd.DataFrame(columns = ['mapid', 'name', 'median_foldchange'])
        for diff_reg_compounds in get_diff_reg_compounds(df, pvalcol, fccol):
            enrichment = compute_overlaps(
                diff_reg_compounds[keggidcol].to_list(),
                pathways,
                df[keggidcol].to_list()
            )
            enrichment['pvalue'] = enrichment.apply(
                lambda x: pvalue(
                    x.ncommon, M, x.ncompounds, len(diff_reg_compounds)
                ),
                axis = 1
            )
            enrichment = fdrcorrection(enrichment, fdr = fdr)
            enrichment = enrichment.loc[enrichment.pvalue <= alpha, :].reset_index(drop = True)
            enrichment['median_foldchange'] = .0
            for i, row in enrichment.iterrows():
                compounds = enrichment.at[i, 'compounds']
                enrichment.loc[i, 'median_foldchange'] = df.loc[df[keggidcol].isin(compounds), fccol].median()
            
            selection = pd.concat(
                [selection, enrichment.loc[:, ['mapid', 'name', 'median_foldchange']]]
            )
        
        selections[fccol] = selection.reset_index()
    
    return selections

foldchangecols = [
    '(P2, WT) / (E14_5, WT)', 
    '(P40, WT) / (P2, WT)', 
    '(P2, KO) / (E14_5, KO)', 
    '(P40, KO) / (P2, KO)'
]
selections = compute_selections(
    hilic,
    selected_pathways,
    'KEGG',
    foldchangecols,
    [col + '.1' for col in foldchangecols]
)
selections

{'(P2, WT) / (E14_5, WT)':     index     mapid                                               name  \
 0       0  mmu00230           Purine metabolism - Mus musculus (mouse)   
 1       1  mmu01040  Biosynthesis of unsaturated fatty acids - Mus ...   
 2       2  mmu04022  cGMP-PKG signaling pathway - Mus musculus (mouse)   
 3       3  mmu00030   Pentose phosphate pathway - Mus musculus (mouse)   
 4       4  mmu00240       Pyrimidine metabolism - Mus musculus (mouse)   
 5       5  mmu02010            ABC transporters - Mus musculus (mouse)   
 6       6  mmu04024      cAMP signaling pathway - Mus musculus (mouse)   
 7       7  mmu00562  Inositol phosphate metabolism - Mus musculus (...   
 8       8  mmu00340        Histidine metabolism - Mus musculus (mouse)   
 9       9  mmu00730         Thiamine metabolism - Mus musculus (mouse)   
 10     10  mmu00591    Linoleic acid metabolism - Mus musculus (mouse)   
 11     11  mmu00040  Pentose and glucuronate interconversions - Mus...   

In [76]:
prefix = '../raw/enriched_pathways_hilic_fc'
for key, selection in selections.items():
    suffix = key[1:-1] \
        .replace(', ', '_') \
        .replace(') / (', '_')
    
    selection = selection.copy()
    
    # change mmu to map in order to conform with iPATH
    selection.loc['mapid'] = selection.mapid.str.replace('mmu', 'map')
    selection.loc[:, ['mapid', 'median_foldchange']].to_csv(
        '_'.join([prefix, suffix]) + '.tsv',
        sep = '\t',
        header = False,
        index = False
    )