# Using MEFISTO to analyse metabolomics data

In [6]:
import pandas as pd
import csv
from thefuzz import fuzz

def get_unambiguous_column_names(column_names):
    """Return ``column_names`` as a list in which every entry is unique.

    The first occurrence of a name is kept unchanged. Every later occurrence
    gets a numeric suffix (``'.1'``, ``'.2'``, ...), choosing the smallest
    suffix that does not collide with a name that was already emitted. A name
    that occurs exactly twice therefore yields ``name`` and ``name.1``.

    Parameters
    ----------
    column_names : iterable of str
        Column names in their original order; may contain repeats.

    Returns
    -------
    list of str
        Names in the same order and of the same length, without duplicates.
    """
    used = set()
    # remembers the next suffix to try per base name so that long runs of the
    # same name do not rescan all smaller suffixes
    next_suffix = {}
    unique_names = []
    for name in column_names:
        candidate = name
        if candidate in used:
            suffix = next_suffix.get(name, 1)
            candidate = f'{name}.{suffix}'
            # skip suffixes that are already taken, e.g. by a literal 'x.1'
            while candidate in used:
                suffix += 1
                candidate = f'{name}.{suffix}'
            next_suffix[name] = suffix + 1

        used.add(candidate)
        unique_names.append(candidate)

    return unique_names

def _insert_addrow_into_series(series, names, values, add_names_set):
    """Return a copy of ``series`` extended with fields from one annotation row.

    ``names`` are the sub-table column names and ``values`` the matching
    cells. Only columns contained in ``add_names_set`` (plus ``'KEGG ID'``,
    which is stored as ``'KEGG'``) are copied. ``'Name'`` and ``'RT [min]'``
    are stored as ``'Name2'`` / ``'RT [min]2'`` so they do not overwrite the
    fields of the main entry. Empty cells are stored as ``None``.
    """
    merged = series.copy()
    for k, v in zip(names, values):
        if k in add_names_set:
            k = k + '2' if k in {'Name', 'RT [min]'} else k
            merged[k] = v if v else None

        elif k == 'KEGG ID':
            merged['KEGG'] = v if v else None

    return merged


def _resolve_entry(series, tmp_names, add_rows, add_names_set):
    """Build the single output row for one main entry.

    The first annotation row whose name equals the entry name is preferred.
    If none matches exactly, the first row with a ``fuzz.partial_ratio``
    above 95 (case-insensitive) is used. Without any match the plain entry is
    returned. Exactly one row is produced per entry in all three cases.
    """
    entry_name = series['Name']
    for row in add_rows:
        if row and row[0] == entry_name:
            return _insert_addrow_into_series(
                series, tmp_names, row, add_names_set
            )

    for row in add_rows:
        if row and fuzz.partial_ratio(row[0].lower(), entry_name.lower()) > 95:
            return _insert_addrow_into_series(
                series, tmp_names, row, add_names_set
            )

    return series.copy()


def parse_metabolomics_results(filename, delimiter = ',', quotechar = '"'):
    """Parse a nested metabolomics result export into a flat DataFrame.

    Layout handled by this parser (as implied by the parsing rules below):

    * the first line is skipped and the second line holds the column names of
      the main entries (made unique via ``get_unambiguous_column_names``),
    * a line with a non-empty first cell starts a new main entry,
    * a line with an empty first cell and ``'Name'`` in the second cell is the
      header of the annotation sub-table,
    * every other line is an annotation row belonging to the current entry;
      its cells are read starting at the second column.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.
    delimiter : str
        Field delimiter passed to ``csv.reader``.
    quotechar : str
        Quote character passed to ``csv.reader``.

    Returns
    -------
    pandas.DataFrame
        One row per main entry with the main columns followed by the
        annotation columns listed in ``add_names`` (``None``/NaN where no
        annotation row matched).

    Raises
    ------
    StopIteration
        If the file has fewer than two lines.
    """
    add_names = [
        'Name2',
        'Molecular Weight',
        'RT [min]2',
        'DeltaMass [ppm]',
        'DBID',
        'Reference List Name',
        'mzLogic Score',
        'ChemSpider ID',
        'KEGG', 'HMDB',
        'Mass List Search Results ID'
    ]
    # names as they appear in the sub-table header ('Name2' -> 'Name', ...);
    # a set gives a fast membership check
    add_names_set = {
        k[:-1] if k in {'Name2', 'RT [min]2'} else k for k in add_names
    }

    data = []
    # newline = '' is the documented way to open files for the csv module
    with open(filename, newline = '') as csvfile:
        csvreader = csv.reader(
            csvfile,
            delimiter = delimiter,
            quotechar = quotechar
        )

        # skip the first line, the second one carries the column names
        next(csvreader)
        names = get_unambiguous_column_names(next(csvreader))

        series = None
        tmp_names = []
        add_rows = []
        for line in csvreader:
            if not line:
                # csv.reader yields [] for blank lines
                continue

            if line[0]:
                # a new main entry starts: emit the previous one first
                if series is not None:
                    data.append(
                        _resolve_entry(
                            series, tmp_names, add_rows, add_names_set
                        )
                    )

                add_rows = []
                series = pd.Series(dict(zip(names, line)))

            elif len(line) > 1 and line[1] == 'Name':
                tmp_names = [name for name in line if name]

            else:
                add_rows.append(line[1: 1 + len(tmp_names)])

        # the last entry has no following entry line that would trigger its
        # emission inside the loop, so emit it here
        if series is not None:
            data.append(
                _resolve_entry(series, tmp_names, add_rows, add_names_set)
            )

    return pd.DataFrame(data, columns = names + add_names)

def clean_dataframe(df):
    """Split ``df`` into rows with and without a KEGG value and de-duplicate.

    Both parts are de-duplicated on the (``ID``, ``KEGG``) pair, keeping the
    first occurrence. Rows without a KEGG value are additionally discarded
    when their ``ID`` also appears among the rows that do have one.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(kegg, no_kegg)``; both keep the original index labels.
    """
    dedup_keys = ['ID', 'KEGG']
    kegg_missing = df.KEGG.isna()

    kegg = df.loc[~kegg_missing, :].copy().drop_duplicates(subset = dedup_keys)
    no_kegg = df.loc[kegg_missing, :].copy().drop_duplicates(subset = dedup_keys)

    # an ID that already has a KEGG-annotated row does not need its
    # un-annotated counterpart
    already_annotated = no_kegg.ID.isin(kegg.ID)
    return kegg, no_kegg.loc[~already_annotated, :]

In [7]:
# Parse each raw export and directly split it into the KEGG-annotated rows
# and the rows without a KEGG value.
hilic, no_kegg_hilic = clean_dataframe(
    parse_metabolomics_results(
        '../raw/Results_Untargeted_Metabolomics_E14-P40_LK_Nova_results_HILIC.csv'
    )
)
rp, no_kegg_rp = clean_dataframe(
    parse_metabolomics_results(
        '../raw/Results_Untargeted_Metabolomics_E14-P40_LK_Nova_results_RP.csv'
    )
)

In [8]:
# combine hilic and rp in two different ways
def get_clean_coarse_group(x):
    """Map a fine-grained group label onto one of a few coarse groups.

    Only the part before the first ``'/'`` is considered, stripped and
    lower-cased. The coarse group is chosen by prefix; labels matching no
    known prefix map to ``'other'``. Non-string input (e.g. the NaN pandas
    uses for an empty cell) also maps to ``'other'`` instead of raising.

    Parameters
    ----------
    x : str or any
        Fine group label.

    Returns
    -------
    str
        The coarse group name.
    """
    if not isinstance(x, str):
        return 'other'

    label = x.split('/')[0].strip().lower()
    # checked in order; first matching prefix wins
    # NOTE(review): 'lipid metablism' is misspelled but kept unchanged because
    # other code may key on this exact label -- confirm before renaming.
    prefix_to_group = (
        ('bio', 'bioenergetic process'),
        ('nucleoside', 'nucleoside metabolism'),
        ('glycolysis', 'glycolysis'),
        ('lipid', 'lipid metablism'),
        ('purine', 'nucleoside metabolism'),
        ('aa', 'aa metabolism'),
    )
    for prefix, group in prefix_to_group:
        if label.startswith(prefix):
            return group

    return 'other'

# For each annotation table (RP and HILIC) build one frame that joins the
# annotation with all four parsed result frames on the metabolite ID.
basedata = {}
for key in ['RP', 'HILIC']:
    base = pd.read_csv(
        f'../raw/untargeted_Metabo_Results_LKnaus_annotated_DM_clean_annot_{key}.csv',
        header = 1
    )
    # the source column names carry a trailing blank
    base = base.loc[:, ['ID', 'Name Metabolite ', 'Group ']]
    base.columns = ['ID', 'metabolitename', 'metabolitegroupfine']
    base['metabolitegroupcoarse'] = base.metabolitegroupfine.apply(
        get_clean_coarse_group
    )

    # inner join keeps only metabolites present in the annotation table
    frames = [
        base.merge(results, on = 'ID', how = 'inner')
        for results in (hilic, no_kegg_hilic, rp, no_kegg_rp)
    ]

    df = pd.concat(frames).reset_index(drop = True)
    # use the metabolite ID as row label
    df.index = df.ID.to_list()
    basedata[key] = df

In [24]:
# Extract the per-sample intensity columns (wild type first, then knock-out)
# as a float matrix indexed by metabolite ID.
data = basedata['HILIC']
sample_patterns = (
    'S[0-9]{2}_[EP][0-9][_0-9]+WT[0-9]',
    'S[0-9]{2}_[EP][0-9][_0-9]+KO[0-9]',
)
data_cols = [
    col
    for pattern in sample_patterns
    for col in data.columns[data.columns.str.match(pattern)].to_list()
]
# drop the first four characters (the 'Sxx_' prefix matched by the patterns)
sample_labels = [col[4:] for col in data_cols]
intensities = data[data_cols].astype(float).values
df = pd.DataFrame(
    intensities,
    index = data.index,
    columns = sample_labels
)
df

Unnamed: 0,P40_WT1,P40_WT2,P40_WT3,P40_WT4,P2_WT1,P2_WT2,P2_WT3,P2_WT4,E14_5_WT1,E14_5_WT2,...,P40_KO3,P40_KO4,P2_KO1,P2_KO2,P2_KO3,P2_KO4,E14_5_KO1,E14_5_KO2,E14_5_KO3,E14_5_KO4
A002_HILIC,2.300000e+09,2.400000e+09,2.100000e+09,2.100000e+09,2.100000e+09,1.900000e+09,1.800000e+09,1.900000e+09,2.800000e+09,2.800000e+09,...,2.000000e+09,2.200000e+09,2.000000e+09,1.600000e+09,1.700000e+09,1.900000e+09,2.800000e+09,2.300000e+09,2.600000e+09,2.900000e+09
A054_HILIC,1.200000e+09,2.000000e+09,2.300000e+09,2.200000e+09,9.800000e+08,9.900000e+08,7.400000e+08,8.000000e+08,5.200000e+08,1.100000e+09,...,2.300000e+09,1.900000e+09,1.700000e+09,1.000000e+09,1.300000e+09,1.400000e+09,9.800000e+08,8.900000e+08,2.200000e+09,1.800000e+09
A062_HILIC,5.800000e+07,8.000000e+07,8.800000e+07,6.800000e+07,2.200000e+08,2.400000e+08,2.100000e+08,2.200000e+08,2.200000e+08,2.800000e+08,...,1.200000e+08,6.800000e+07,4.000000e+08,3.700000e+08,3.200000e+08,3.800000e+08,2.400000e+08,2.700000e+08,3.800000e+08,2.600000e+08
B122_HILIC,1.100000e+11,1.200000e+11,1.300000e+11,1.200000e+11,1.000000e+11,9.900000e+10,9.400000e+10,9.700000e+10,1.300000e+11,1.200000e+11,...,1.200000e+11,1.200000e+11,1.000000e+11,9.600000e+10,9.600000e+10,1.000000e+11,1.200000e+11,1.200000e+11,9.800000e+10,1.100000e+11
A012_HILIC,1.100000e+09,1.200000e+09,1.200000e+09,1.100000e+09,8.000000e+08,6.600000e+08,6.400000e+08,7.800000e+08,1.100000e+09,1.300000e+09,...,1.100000e+09,1.000000e+09,7.000000e+08,6.100000e+08,6.300000e+08,7.400000e+08,1.100000e+09,8.800000e+08,7.900000e+08,6.600000e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C166_RP,7.900000e+07,8.200000e+07,8.700000e+07,7.500000e+07,1.600000e+07,9.500000e+06,1.100000e+07,1.100000e+07,1.100000e+07,1.200000e+07,...,9.500000e+07,1.000000e+08,1.600000e+07,1.200000e+07,1.500000e+07,1.700000e+07,7.200000e+06,1.100000e+07,1.500000e+07,8.900000e+06
C164_RP,6.400000e+07,6.800000e+07,7.300000e+07,7.800000e+07,2.600000e+07,2.000000e+07,1.900000e+07,2.400000e+07,3.400000e+07,8.700000e+07,...,1.100000e+08,1.100000e+08,2.600000e+07,2.100000e+07,2.400000e+07,3.700000e+07,2.400000e+07,7.800000e+07,7.900000e+07,4.400000e+07
C161_RP,3.900000e+06,4.200000e+06,4.100000e+06,3.700000e+06,7.100000e+07,5.400000e+07,5.400000e+07,5.600000e+07,1.700000e+07,1.400000e+07,...,5.800000e+06,4.800000e+06,6.300000e+07,5.700000e+07,6.300000e+07,5.400000e+07,1.500000e+07,9.400000e+06,1.200000e+07,1.000000e+07
C116_RP,4.800000e+08,5.200000e+08,5.300000e+08,5.300000e+08,8.100000e+08,8.300000e+08,8.000000e+08,8.300000e+08,8.800000e+08,8.200000e+08,...,5.900000e+08,5.200000e+08,7.100000e+08,7.000000e+08,7.200000e+08,8.000000e+08,8.300000e+08,6.600000e+08,6.400000e+08,6.800000e+08
