In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [14]:
# Load the lipidomics spreadsheet; header=16 tells pandas to take the column
# labels from 0-indexed row 16 and to start the data on the row after it.
df = pd.read_excel('071621PsletcherLipidomics1_10.xls', header=16)

# Sample labels for the two groups (presumably control vs. intralipid-treated
# samples -- confirm against the experiment notes).
controls = ['C1', 'C5', 'C6', 'C7', 'C9']
intralipids = ['IL1', 'IL2', 'IL3', 'IL4', 'IL5']
groups = dict(C=controls, IL=intralipids)
sample_names = [*controls, *intralipids]

# Number of per-lipid fatty-acid columns expected in the sheet.
num_fatty_acid_columns = 4

In [15]:
def strfilter(func, s: str) -> str:
    """
    Keep only the characters of s that satisfy a predicate.

    Args:
        func: callable applied to each single character of s; a character is
            kept when the result is truthy.
        s: the string to filter.
    Returns:
        the kept characters joined together in their original order.
    """
    kept = []
    for ch in s:
        if func(ch):
            kept.append(ch)
    return ''.join(kept)
def average_samples(data):
    """
    Add one mean column per multi-sample group to data.

    For every entry of the module-level `groups` mapping that lists more than
    one sample, a column named after the group key is written into data,
    holding the row-wise mean of that group's sample columns. Groups with a
    single sample are skipped.

    Args:
        data: a pandas dataframe containing a column for every sample name
        listed in `groups`.
    Returns:
        the same dataframe object, which has been modified in place.
    """
    for group_name, members in groups.items():
        if len(members) <= 1:
            continue
        data[group_name] = data.loc[:, members].mean(axis=1)
    return data
def name_map(name: str) -> str:
    """
    Map a sample name to the key of the first group in `groups` that lists it.

    Returns name unchanged when no group contains it.
    """
    return next((key for key, members in groups.items() if name in members), name)

In [16]:
# constants

# Lipid classes treated as phospholipids when filtering on the 'Class' column.
phospholipids = [
    'PC', 'PE', 'LPC', 'LPE', 'PS', 'LPS', 'PG', 'LPG', 'PA',
    'LPA', 'PI', 'LPI', 'PIP', 'PIP2', 'PIP3', 'CL', 'PGP',
    'PPA', 'CDP-DG',
]

# Atomic masses keyed by element symbol.
atom_masses = {
    'C': 12.011,
    'N': 14.007,
    'H': 1.008,
    'O': 15.999,
    'P': 30.974,
}

# Common names for selected fatty-acid formulas ('carbons:double bonds').
fa = {
    '18:1': 'oleic acid',
    '16:0': 'palmitic acid',
    '18:2': 'linoleic acid',
    '16:1': 'palmitoleic acid',
    '18:0': 'stearic acid',
    '20:4': 'ETA',
    '14:0': 'myristic acid',
}

# 'Area[c-1]', 'Area[c-2]', ... -- one area column per entry of sample_names.
sample_columns = [f'Area[c-{n}]' for n in range(1, len(sample_names) + 1)]
# 'FA1' .. 'FA<num_fatty_acid_columns>'.
fa_columns = [f'FA{n}' for n in range(1, num_fatty_acid_columns + 1)]

In [17]:
def sum_areas(data):
    """
    Sum the areas corresponding to each fatty acid.

    Args:
        data: a pandas dataframe with one row per lipid, containing the
        fatty-acid columns listed in `fa_columns` and the per-sample area
        columns listed in `sample_columns`. The fatty-acid columns are
        rewritten in place with their standardized formulas.
    Returns:
        a new pandas dataframe with fatty acids as index and columns for each sample, where the
        value at df.loc[fatty_acid, sample] is the sum of the areas for that sample in rows of
        data where fatty_acid is one of the fatty acids in the lipid, counting multiple times if
        the fatty acid appears multiple times in that lipid.
    """
    # we treat '(12:2p)' and '(12:2)' as the same fatty acid
    # Column-wise Series.map is used instead of DataFrame.applymap, which is
    # deprecated since pandas 2.1; this form behaves the same on old and new
    # pandas. Missing values are passed through by standardize_formula.
    data.loc[:, fa_columns] = data.loc[:, fa_columns].apply(
        lambda column: column.map(standardize_formula))
    # One partial table per fatty-acid position: areas summed by the formula
    # found in that position (groupby drops rows whose key is missing).
    per_position = [data[sample_columns].groupby(data[column]).sum()
                    for column in fa_columns]
    # Stack the partial tables and merge rows that share a formula, so a fatty
    # acid occurring in several positions is counted once per occurrence.
    return pd.concat(per_position).groupby(level=0).sum()

In [18]:
def how_much_saturated(formula: str) -> str:
    """
    Classify a fatty-acid formula by its number of double bonds.

    Accepts the raw and the cleaned-up spellings alike: '(14:2)', '(14:2p)',
    '14:2' and '14:2 (some name)' are all read as two double bonds. Only the
    run of digits directly after the first ':' is used.

    Args:
        formula: a string of the form 'carbons:double_bonds', optionally
            wrapped in parentheses and/or followed by other text.
    Returns:
        'saturated' for 0 double bonds, 'mono' for 1, 'poly' otherwise.
    Raises:
        IndexError: if formula contains no ':'.
        ValueError: if no digits follow the first ':'.
    """
    tail = formula.split(':')[1].lstrip()
    # Take only the leading ASCII digits so trailing ')', 'p', or an appended
    # ' (name)' does not reach int().
    end = 0
    while end < len(tail) and tail[end] in '0123456789':
        end += 1
    if end == 0:
        raise ValueError(f'no double-bond count found in formula {formula!r}')
    double_bonds = int(tail[:end])

    if double_bonds == 0:
        return 'saturated'
    if double_bonds == 1:
        return 'mono'
    return 'poly'

In [19]:
def standardize_formula(formula: str) -> str:
    """
    Reduce a fatty-acid formula to bare 'carbons:double_bonds' digits.

    For example '(12:20p)' becomes '12:20'. Every character that is not a
    decimal digit is removed from each side of the ':'.

    Args:
        formula: the formula string, or a missing value.
    Returns:
        the cleaned formula; a missing value (as judged by pd.isna) is
        returned unchanged.
    Raises:
        ValueError: if formula does not contain exactly one ':'.
    """
    if pd.isna(formula):
        return formula
    # Unpacking into two names enforces exactly one ':' in the input.
    carbons, bonds = (strfilter(str.isdecimal, part) for part in formula.split(':'))
    return f'{carbons}:{bonds}'

In [20]:
def rename_parts(data):
    """
    Label a summed-area table with sample names and common fatty-acid names.

    Index labels that are keys of `fa` get the common name appended, e.g.
    '18:1' becomes '18:1 (oleic acid)'; all other labels are kept as they are.
    The columns ('Area[c-1]', 'Area[c-2]', etc) are then replaced, position by
    position, with the entries of `sample_names`.

    Args:
        data: a pandas dataframe indexed by fatty-acid formula with one column
        per sample, in the same order as `sample_names`.
    Returns:
        the relabelled dataframe produced by DataFrame.rename.
    """
    # Labels missing from the mapping are left untouched by rename().
    with_common_names = {formula: f'{formula} ({common})' for formula, common in fa.items()}
    renamed = data.rename(index=with_common_names)
    renamed.columns = sample_names
    return renamed

In [21]:
# Split the lipid table by class. reset_index() renumbers the rows from 0 and
# keeps the previous row labels in an 'index' column.
df_phospho = df[df['Class'].isin(phospholipids)].reset_index()
df_TG = df[df['Class'] == 'TG'].reset_index()

# Per-fatty-acid area totals for each class, labelled with the sample names.
areas_tg = rename_parts(sum_areas(df_TG))
areas_phospho = rename_parts(sum_areas(df_phospho))

In [22]:
# Triglycerides: total area per saturation class, then each sample column is
# divided by its own total so the classes sum to 1 within a sample.
a = (
    areas_tg
    .groupby(list(map(how_much_saturated, areas_tg.index)))
    .sum()
    .pipe(lambda totals: totals / totals.sum())
)

In [23]:
# Phospholipids: total area per saturation class, then each sample column is
# divided by its own total so the classes sum to 1 within a sample.
b = (
    areas_phospho
    .groupby(list(map(how_much_saturated, areas_phospho.index)))
    .sum()
    .pipe(lambda totals: totals / totals.sum())
)

In [24]:
# Write each result table to its own workbook, in the same order as before.
for _out_path, _out_frame in [
    ("triglycerides_saturated.xlsx", a),
    ("phospholipids_saturated.xlsx", b),
    ("triglycerides_areas.xlsx", areas_tg),
    ("phospholipids_areas.xlsx", areas_phospho),
]:
    _out_frame.to_excel(_out_path)